Discussion:
[01/51] [partial] mahout git commit: NO-JIRA Clean up MR refactor
r***@apache.org
2018-06-28 14:54:29 UTC
Permalink
Repository: mahout
Updated Branches:
refs/heads/branch-0.14.0 e0573de33 -> 410ed16af


http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
new file mode 100644
index 0000000..19f78b5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/sequencefile/PathFilters.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator.sequencefile;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+/**
+ * Supplies some useful and repeatedly-used instances of {@link PathFilter}.
+ */
+public final class PathFilters {
+
+ private static final PathFilter PART_FILE_INSTANCE = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ String name = path.getName();
+ return name.startsWith("part-") && !name.endsWith(".crc");
+ }
+ };
+
+ /**
+ * {@link PathFilter} that accepts the final clustering output file.
+ */
+ private static final PathFilter CLUSTER_FINAL = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ String name = path.getName();
+ return name.startsWith("clusters-") && name.endsWith("-final");
+ }
+ };
+
+ private static final PathFilter LOGS_CRC_INSTANCE = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ String name = path.getName();
+ return !(name.endsWith(".crc") || name.startsWith(".") || name.startsWith("_"));
+ }
+ };
+
+ private PathFilters() {
+ }
+
+ /**
+ * @return {@link PathFilter} that accepts paths whose file name starts with "part-". Excludes
+ * ".crc" files.
+ */
+ public static PathFilter partFilter() {
+ return PART_FILE_INSTANCE;
+ }
+
+ /**
+ * @return {@link PathFilter} that accepts paths whose file name starts with "clusters-" and ends with "-final".
+ */
+ public static PathFilter finalPartFilter() {
+ return CLUSTER_FINAL;
+ }
+
+ /**
+ * @return {@link PathFilter} that rejects paths whose file name starts with "_" (e.g. Cloudera
+ * _SUCCESS files or Hadoop _logs), or "." (e.g. local hidden files), or ends with ".crc"
+ */
+ public static PathFilter logsCRCFilter() {
+ return LOGS_CRC_INSTANCE;
+ }
+
+}
r***@apache.org
2018-06-28 14:54:31 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
new file mode 100644
index 0000000..8072466
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/AbstractJob.java
@@ -0,0 +1,648 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.lucene.AnalyzerUtils;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or
+ * more maps and reduces in order to accomplish some task.</p>
+ *
+ * <p>Command line arguments available to all subclasses are:</p>
+ *
+ * <ul>
+ * <li>--tempDir (path): Specifies a directory where the job may place temp files
+ * (default "temp")</li>
+ * <li>--help: Show help message</li>
+ * </ul>
+ *
+ * <p>In addition, note some key command line parameters that are parsed by Hadoop, which jobs
+ * may need to set:</p>
+ *
+ * <ul>
+ * <li>-Dmapred.job.name=(name): Sets the Hadoop task names. It will be suffixed by
+ * the mapper and reducer class names</li>
+ * <li>-Dmapred.output.compress={true,false}: Compress final output (default true)</li>
+ * <li>-Dmapred.input.dir=(path): input file, or directory containing input files (required)</li>
+ * <li>-Dmapred.output.dir=(path): path to write output files (required)</li>
+ * </ul>
+ *
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
+ * arguments.</p>
+ */
+public abstract class AbstractJob extends Configured implements Tool {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);
+
+ /** option used to specify the input path */
+ private Option inputOption;
+
+ /** option used to specify the output path */
+ private Option outputOption;
+
+ /** input path, populated by {@link #parseArguments(String[])} */
+ protected Path inputPath;
+ protected File inputFile; //the input represented as a file
+
+ /** output path, populated by {@link #parseArguments(String[])} */
+ protected Path outputPath;
+ protected File outputFile; //the output represented as a file
+
+ /** temp path, populated by {@link #parseArguments(String[])} */
+ protected Path tempPath;
+
+ protected Map<String, List<String>> argMap;
+
+ /** internal list of options that have been added */
+ private final List<Option> options;
+ private Group group;
+
+ protected AbstractJob() {
+ options = new LinkedList<>();
+ }
+
+ /** Returns the input path established by a call to {@link #parseArguments(String[])}.
+ * The source of the path may be an input option added using {@link #addInputOption()}
+ * or it may be the value of the {@code mapred.input.dir} configuration
+ * property.
+ */
+ protected Path getInputPath() {
+ return inputPath;
+ }
+
+ /** Returns the output path established by a call to {@link #parseArguments(String[])}.
+ * The source of the path may be an output option added using {@link #addOutputOption()}
+ * or it may be the value of the {@code mapred.output.dir} configuration
+ * property.
+ */
+ protected Path getOutputPath() {
+ return outputPath;
+ }
+
+ protected Path getOutputPath(String path) {
+ return new Path(outputPath, path);
+ }
+
+ protected File getInputFile() {
+ return inputFile;
+ }
+
+ protected File getOutputFile() {
+ return outputFile;
+ }
+
+
+ protected Path getTempPath() {
+ return tempPath;
+ }
+
+ protected Path getTempPath(String directory) {
+ return new Path(tempPath, directory);
+ }
+
+ @Override
+ public Configuration getConf() {
+ Configuration result = super.getConf();
+ if (result == null) {
+ return new Configuration();
+ }
+ return result;
+ }
+
+ /** Add an option with no argument whose presence can be checked for using
+ * {@code containsKey} method on the map returned by {@link #parseArguments(String[])};
+ */
+ protected void addFlag(String name, String shortName, String description) {
+ options.add(buildOption(name, shortName, description, false, false, null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. This options has an argument
+ * with null as its default value.
+ */
+ protected void addOption(String name, String shortName, String description) {
+ options.add(buildOption(name, shortName, description, true, false, null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called.
+ *
+ * @param required if true, {@link #parseArguments(String[])} will fail
+ * with an error and usage message if this option is not specified
+ * on the command line.
+ */
+ protected void addOption(String name, String shortName, String description, boolean required) {
+ options.add(buildOption(name, shortName, description, true, required, null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. If this option is not
+ * specified on the command line the default value will be
+ * used.
+ *
+ * @param defaultValue the default argument value if this argument is not
+ * found on the command-line. null is allowed.
+ */
+ protected void addOption(String name, String shortName, String description, String defaultValue) {
+ options.add(buildOption(name, shortName, description, true, false, defaultValue));
+ }
+
+ /** Add an arbitrary option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. If this option has no
+ * argument, use {@code containsKey} on the map returned by
+ * {@code parseArguments} to check for its presence. Otherwise, the
+ * string value of the option will be placed in the map using a key
+ * equal to this options long name preceded by '--'.
+ * @return the option added.
+ */
+ protected Option addOption(Option option) {
+ options.add(option);
+ return option;
+ }
+
+ protected Group getGroup() {
+ return group;
+ }
+
+ /** Add the default input directory option, '-i' which takes a directory
+ * name as an argument. When {@link #parseArguments(String[])} is
+ * called, the inputPath will be set based upon the value for this option.
+ * If this method is called, the input is required.
+ */
+ protected void addInputOption() {
+ this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
+ }
+
+ /** Add the default output directory option, '-o' which takes a directory
+ * name as an argument. When {@link #parseArguments(String[])} is
+ * called, the outputPath will be set based upon the value for this option.
+ * If this method is called, the output is required.
+ */
+ protected void addOutputOption() {
+ this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
+ }
+
+ /** Build an option with the given parameters. Name and description are
+ * required.
+ *
+ * @param name the long name of the option prefixed with '--' on the command-line
+ * @param shortName the short name of the option, prefixed with '-' on the command-line
+ * @param description description of the option displayed in help method
+ * @param hasArg true if the option has an argument.
+ * @param required true if the option is required.
+ * @param defaultValue default argument value, can be null.
+ * @return the option.
+ */
+ protected static Option buildOption(String name,
+ String shortName,
+ String description,
+ boolean hasArg,
+ boolean required,
+ String defaultValue) {
+
+ return buildOption(name, shortName, description, hasArg, 1, 1, required, defaultValue);
+ }
+
+ protected static Option buildOption(String name,
+ String shortName,
+ String description,
+ boolean hasArg, int min, int max,
+ boolean required,
+ String defaultValue) {
+
+ DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description)
+ .withRequired(required);
+
+ if (shortName != null) {
+ optBuilder.withShortName(shortName);
+ }
+
+ if (hasArg) {
+ ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(min).withMaximum(max);
+
+ if (defaultValue != null) {
+ argBuilder = argBuilder.withDefault(defaultValue);
+ }
+
+ optBuilder.withArgument(argBuilder.create());
+ }
+
+ return optBuilder.create();
+ }
+
+ /**
+ * @param name The name of the option
+ * @return the {@link org.apache.commons.cli2.Option} with the name, else null
+ */
+ protected Option getCLIOption(String name) {
+ for (Option option : options) {
+ if (option.getPreferredName().equals(name)) {
+ return option;
+ }
+ }
+ return null;
+ }
+
+ /** Parse the arguments specified based on the options defined using the
+ * various {@code addOption} methods. If -h is specified or an
+ * exception is encountered print help and return null. Has the
+ * side effect of setting inputPath and outputPath
+ * if {@code addInputOption} or {@code addOutputOption}
+ * or {@code mapred.input.dir} or {@code mapred.output.dir}
+ * are present in the Configuration.
+ *
+ * @return a {@code Map<String,String>} containing options and their argument values.
+ * The presence of a flag can be tested using {@code containsKey}, while
+ * argument values can be retrieved using {@code get(optionName)}. The
+ * names used for keys are the option name parameter prefixed by '--'.
+ *
+ * @see #parseArguments(String[], boolean, boolean) -- passes in false, false for the optional args.
+ */
+ public Map<String, List<String>> parseArguments(String[] args) throws IOException {
+ return parseArguments(args, false, false);
+ }
+
+ /**
+ *
+ * @param args The args to parse
+ * @param inputOptional if true, then the input option, if set, need not be present. If false and input is an option
+ * and there is no input, then throw an error
+ * @param outputOptional if true, then the output option, if set, need not be present. If false and output is an
+ * option and there is no output, then throw an error
+ * @return the args parsed into a map.
+ */
+ public Map<String, List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional)
+ throws IOException {
+ Option helpOpt = addOption(DefaultOptionCreator.helpOption());
+ addOption("tempDir", null, "Intermediate output directory", "temp");
+ addOption("startPhase", null, "First phase to run", "0");
+ addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));
+
+ GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");
+
+ for (Option opt : options) {
+ gBuilder = gBuilder.withOption(opt);
+ }
+
+ group = gBuilder.create();
+
+ CommandLine cmdLine;
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ parser.setHelpOption(helpOpt);
+ cmdLine = parser.parse(args);
+
+ } catch (OptionException e) {
+ log.error(e.getMessage());
+ CommandLineUtil.printHelpWithGenericOptions(group, e);
+ return null;
+ }
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelpWithGenericOptions(group);
+ return null;
+ }
+
+ try {
+ parseDirectories(cmdLine, inputOptional, outputOptional);
+ } catch (IllegalArgumentException e) {
+ log.error(e.getMessage());
+ CommandLineUtil.printHelpWithGenericOptions(group);
+ return null;
+ }
+
+ argMap = new TreeMap<>();
+ maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));
+
+ this.tempPath = new Path(getOption("tempDir"));
+
+ if (!hasOption("quiet")) {
+ log.info("Command line arguments: {}", argMap);
+ }
+ return argMap;
+ }
+
+ /**
+ * Build the option key (--name) from the option name
+ */
+ public static String keyFor(String optionName) {
+ return "--" + optionName;
+ }
+
+ /**
+ * @return the requested option, or null if it has not been specified
+ */
+ public String getOption(String optionName) {
+ List<String> list = argMap.get(keyFor(optionName));
+ if (list != null && !list.isEmpty()) {
+ return list.get(0);
+ }
+ return null;
+ }
+
+ /**
+ * Get the option, else the default
+ * @param optionName The name of the option to look up, without the --
+ * @param defaultVal The default value.
+ * @return The requested option, else the default value if it doesn't exist
+ */
+ public String getOption(String optionName, String defaultVal) {
+ String res = getOption(optionName);
+ if (res == null) {
+ res = defaultVal;
+ }
+ return res;
+ }
+
+ public int getInt(String optionName) {
+ return Integer.parseInt(getOption(optionName));
+ }
+
+ public int getInt(String optionName, int defaultVal) {
+ return Integer.parseInt(getOption(optionName, String.valueOf(defaultVal)));
+ }
+
+ public float getFloat(String optionName) {
+ return Float.parseFloat(getOption(optionName));
+ }
+
+ public float getFloat(String optionName, float defaultVal) {
+ return Float.parseFloat(getOption(optionName, String.valueOf(defaultVal)));
+ }
+
+ /**
+ * Options can occur multiple times, so return the list
+ * @param optionName The unadorned (no "--" prefixing it) option name
+ * @return The values, else null. If the option is present, but has no values, then the result will be an
+ * empty list (Collections.emptyList())
+ */
+ public List<String> getOptions(String optionName) {
+ return argMap.get(keyFor(optionName));
+ }
+
+ /**
+ * @return true if the requested option has been specified
+ */
+ public boolean hasOption(String optionName) {
+ return argMap.containsKey(keyFor(optionName));
+ }
+
+
+ /**
+ * Get the cardinality of the input vectors
+ *
+ * @param matrix path to a {@code SequenceFile} whose values are {@link VectorWritable}s
+ * @return the cardinality of the vector
+ */
+ public int getDimensions(Path matrix) throws IOException {
+ try (SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(getConf()), matrix, getConf())){
+ Writable row = ClassUtils.instantiateAs(reader.getKeyClass().asSubclass(Writable.class), Writable.class);
+ Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
+ "value type of sequencefile must be a VectorWritable");
+
+ VectorWritable vectorWritable = new VectorWritable();
+ boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
+ Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");
+ return vectorWritable.get().size();
+ }
+ }
+
+ /**
+ * Obtain input and output directories from command-line options or hadoop
+ * properties. If {@code addInputOption} or {@code addOutputOption}
+ * has been called, this method will throw an {@code OptionException} if
+ * no source (command-line or property) for that value is present.
+ * Otherwise, {@code inputPath} or {@code outputPath} will be
+ * non-null only if specified as a hadoop property. Command-line options
+ * take precedence over hadoop properties.
+ *
+ * @throws IllegalArgumentException if either inputOption is present,
+ * and neither {@code --input} nor {@code -Dmapred.input.dir} are
+ * specified or outputOption is present and neither {@code --output}
+ * nor {@code -Dmapred.output.dir} are specified.
+ */
+ protected void parseDirectories(CommandLine cmdLine, boolean inputOptional, boolean outputOptional) {
+
+ Configuration conf = getConf();
+
+ if (inputOption != null && cmdLine.hasOption(inputOption)) {
+ this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
+ this.inputFile = new File(cmdLine.getValue(inputOption).toString());
+ }
+ if (inputPath == null && conf.get("mapred.input.dir") != null) {
+ this.inputPath = new Path(conf.get("mapred.input.dir"));
+ }
+
+ if (outputOption != null && cmdLine.hasOption(outputOption)) {
+ this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
+ this.outputFile = new File(cmdLine.getValue(outputOption).toString());
+ }
+ if (outputPath == null && conf.get("mapred.output.dir") != null) {
+ this.outputPath = new Path(conf.get("mapred.output.dir"));
+ }
+
+ Preconditions.checkArgument(inputOptional || inputOption == null || inputPath != null,
+ "No input specified or -Dmapred.input.dir must be provided to specify input directory");
+ Preconditions.checkArgument(outputOptional || outputOption == null || outputPath != null,
+ "No output specified: or -Dmapred.output.dir must be provided to specify output directory");
+ }
+
+ protected static void maybePut(Map<String, List<String>> args, CommandLine cmdLine, Option... opt) {
+ for (Option o : opt) {
+
+ // the option appeared on the command-line, or it has a value
+ // (which is likely a default value).
+ if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null
+ || (cmdLine.getValues(o) != null && !cmdLine.getValues(o).isEmpty())) {
+
+ // nulls are ok, for cases where options are simple flags.
+ List<?> vo = cmdLine.getValues(o);
+ if (vo != null && !vo.isEmpty()) {
+ List<String> vals = new ArrayList<>();
+ for (Object o1 : vo) {
+ vals.add(o1.toString());
+ }
+ args.put(o.getPreferredName(), vals);
+ } else {
+ args.put(o.getPreferredName(), null);
+ }
+ }
+ }
+ }
+
+ /**
+ *
+ * @param args The input argument map
+ * @param optName The adorned (including "--") option name
+ * @return The first value in the match, else null
+ */
+ public static String getOption(Map<String, List<String>> args, String optName) {
+ List<String> res = args.get(optName);
+ if (res != null && !res.isEmpty()) {
+ return res.get(0);
+ }
+ return null;
+ }
+
+
+ protected static boolean shouldRunNextPhase(Map<String, List<String>> args, AtomicInteger currentPhase) {
+ int phase = currentPhase.getAndIncrement();
+ String startPhase = getOption(args, "--startPhase");
+ String endPhase = getOption(args, "--endPhase");
+ boolean phaseSkipped = (startPhase != null && phase < Integer.parseInt(startPhase))
+ || (endPhase != null && phase > Integer.parseInt(endPhase));
+ if (phaseSkipped) {
+ log.info("Skipping phase {}", phase);
+ }
+ return !phaseSkipped;
+ }
+
+ protected Job prepareJob(Path inputPath,
+ Path outputPath,
+ Class<? extends InputFormat> inputFormat,
+ Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey,
+ Class<? extends Writable> mapperValue,
+ Class<? extends OutputFormat> outputFormat) throws IOException {
+ return prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue, outputFormat, null);
+
+ }
+ protected Job prepareJob(Path inputPath,
+ Path outputPath,
+ Class<? extends InputFormat> inputFormat,
+ Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey,
+ Class<? extends Writable> mapperValue,
+ Class<? extends OutputFormat> outputFormat,
+ String jobname) throws IOException {
+
+ Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+ inputFormat, mapper, mapperKey, mapperValue, outputFormat, getConf());
+
+ String name =
+ jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
+
+ job.setJobName(name);
+ return job;
+
+ }
+
+ protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
+ Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue) throws IOException {
+ return prepareJob(inputPath, outputPath, SequenceFileInputFormat.class, mapper, mapperKey, mapperValue, reducer,
+ reducerKey, reducerValue, SequenceFileOutputFormat.class);
+ }
+
+ protected Job prepareJob(Path inputPath,
+ Path outputPath,
+ Class<? extends InputFormat> inputFormat,
+ Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey,
+ Class<? extends Writable> mapperValue,
+ Class<? extends Reducer> reducer,
+ Class<? extends Writable> reducerKey,
+ Class<? extends Writable> reducerValue,
+ Class<? extends OutputFormat> outputFormat) throws IOException {
+ Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+ inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey, reducerValue, outputFormat, getConf());
+ job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
+ return job;
+ }
+
+ /**
+ * necessary to make this job (having a combined input path) work on Amazon S3, hopefully this is
+ * obsolete when MultipleInputs is available again
+ */
+ public static void setS3SafeCombinedInputPath(Job job, Path referencePath, Path inputPathOne, Path inputPathTwo)
+ throws IOException {
+ FileSystem fs = FileSystem.get(referencePath.toUri(), job.getConfiguration());
+ FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), inputPathTwo.makeQualified(fs));
+ }
+
+ protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException {
+ Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+ if (hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
+ String className = getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION);
+ analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
+ //ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+ AnalyzerUtils.createAnalyzer(analyzerClass);
+ }
+ return analyzerClass;
+ }
+
+ /**
+ * Overrides the base implementation to install the Oozie action configuration resource
+ * into the provided Configuration object; note that ToolRunner calls setConf on the Tool
+ * before it invokes run.
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+
+ // If running in an Oozie workflow as a Java action, need to add the
+ // Configuration resource provided by Oozie to this job's config.
+ String oozieActionConfXml = System.getProperty("oozie.action.conf.xml");
+ if (oozieActionConfXml != null && conf != null) {
+ conf.addResource(new Path("file:///", oozieActionConfXml));
+ log.info("Added Oozie action Configuration resource {} to the Hadoop Configuration", oozieActionConfXml);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
new file mode 100644
index 0000000..8052ef1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/ClassUtils.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.lang.reflect.InvocationTargetException;
+
+public final class ClassUtils {
+
+ private ClassUtils() {}
+
+ public static <T> T instantiateAs(String classname, Class<T> asSubclassOfClass) {
+ try {
+ return instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass), asSubclassOfClass);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ public static <T> T instantiateAs(String classname, Class<T> asSubclassOfClass, Class<?>[] params, Object[] args) {
+ try {
+ return instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass), asSubclassOfClass, params, args);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ public static <T> T instantiateAs(Class<? extends T> clazz,
+ Class<T> asSubclassOfClass,
+ Class<?>[] params,
+ Object[] args) {
+ try {
+ return clazz.asSubclass(asSubclassOfClass).getConstructor(params).newInstance(args);
+ } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException ie) {
+ throw new IllegalStateException(ie);
+ }
+ }
+
+
+ public static <T> T instantiateAs(Class<? extends T> clazz, Class<T> asSubclassOfClass) {
+ try {
+ return clazz.asSubclass(asSubclassOfClass).getConstructor().newInstance();
+ } catch (InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException ie) {
+ throw new IllegalStateException(ie);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
new file mode 100644
index 0000000..ac4ab88
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/CommandLineUtil.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.GenericOptionsParser;
+
+public final class CommandLineUtil {
+
+ private CommandLineUtil() { }
+
+ public static void printHelp(Group group) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.setGroup(group);
+ formatter.print();
+ }
+
+ /**
+ * Print the options supported by {@code GenericOptionsParser}.
+ * In addition to the options supported by the job, passed in as the
+ * group parameter.
+ *
+ * @param group job-specific command-line options.
+ */
+ public static void printHelpWithGenericOptions(Group group) throws IOException {
+ new GenericOptionsParser(new Configuration(), new org.apache.commons.cli.Options(), new String[0]);
+ PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true);
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.setGroup(group);
+ formatter.setPrintWriter(pw);
+ formatter.setFooter("Specify HDFS directories while running on hadoop; else specify local file system directories");
+ formatter.print();
+ }
+
+ public static void printHelpWithGenericOptions(Group group, OptionException oe) throws IOException {
+ new GenericOptionsParser(new Configuration(), new org.apache.commons.cli.Options(), new String[0]);
+ PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true);
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.setGroup(group);
+ formatter.setPrintWriter(pw);
+ formatter.setException(oe);
+ formatter.print();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
new file mode 100644
index 0000000..34515aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/HadoopUtil.java
@@ -0,0 +1,435 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * Static convenience methods for configuring Hadoop MapReduce jobs and for
 * working with paths, sequence files, and the distributed cache.
 */
public final class HadoopUtil {

  private static final Logger log = LoggerFactory.getLogger(HadoopUtil.class);

  private HadoopUtil() { }

  /**
   * Create a map-only Hadoop Job out of the passed in parameters. Does not set the
   * Job name.
   *
   * @param inputPath the input {@link org.apache.hadoop.fs.Path}
   * @param outputPath the output {@link org.apache.hadoop.fs.Path}
   * @param inputFormat the {@link org.apache.hadoop.mapreduce.InputFormat} class
   * @param mapper the {@link org.apache.hadoop.mapreduce.Mapper} class; must not be the
   *   identity {@code Mapper} itself, since the user job jar is located from it
   * @param mapperKey the map output (and job output) key class
   * @param mapperValue the map output (and job output) value class
   * @param outputFormat the {@link org.apache.hadoop.mapreduce.OutputFormat} class
   * @param conf configuration copied into the new job
   * @return the configured map-only {@link org.apache.hadoop.mapreduce.Job} (zero reducers)
   * @throws IOException if the job cannot be created
   *
   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
   */
  public static Job prepareJob(Path inputPath,
                               Path outputPath,
                               Class<? extends InputFormat> inputFormat,
                               Class<? extends Mapper> mapper,
                               Class<? extends Writable> mapperKey,
                               Class<? extends Writable> mapperValue,
                               Class<? extends OutputFormat> outputFormat, Configuration conf) throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    // Hadoop's identity Mapper lives in the Hadoop jar, so it cannot be used to
    // locate the user's job jar.
    if (mapper.equals(Mapper.class)) {
      throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    // NOTE(review): uses the old pre-MR2 property name ("mapred.input.dir") — confirm
    // this is intended for the Hadoop version in use.
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    // Map-only job: map output classes double as the job output classes.
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
  }

  /**
   * Create a map and reduce Hadoop job. Does not set the name on the job.
   * @param inputPath The input {@link org.apache.hadoop.fs.Path}
   * @param outputPath The output {@link org.apache.hadoop.fs.Path}
   * @param inputFormat The {@link org.apache.hadoop.mapreduce.InputFormat}
   * @param mapper The {@link org.apache.hadoop.mapreduce.Mapper} class to use
   * @param mapperKey The {@link org.apache.hadoop.io.Writable} key class. If the Mapper is a no-op,
   *                  this value may be null
   * @param mapperValue The {@link org.apache.hadoop.io.Writable} value class. If the Mapper is a no-op,
   *                    this value may be null
   * @param reducer The {@link org.apache.hadoop.mapreduce.Reducer} to use
   * @param reducerKey The reducer key class.
   * @param reducerValue The reducer value class.
   * @param outputFormat The {@link org.apache.hadoop.mapreduce.OutputFormat}.
   * @param conf The {@link org.apache.hadoop.conf.Configuration} to use.
   * @return The {@link org.apache.hadoop.mapreduce.Job}.
   * @throws IOException if there is a problem with the IO.
   *
   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
   * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class,
   *   org.apache.hadoop.conf.Configuration)
   */
  public static Job prepareJob(Path inputPath,
                               Path outputPath,
                               Class<? extends InputFormat> inputFormat,
                               Class<? extends Mapper> mapper,
                               Class<? extends Writable> mapperKey,
                               Class<? extends Writable> mapperValue,
                               Class<? extends Reducer> reducer,
                               Class<? extends Writable> reducerKey,
                               Class<? extends Writable> reducerValue,
                               Class<? extends OutputFormat> outputFormat,
                               Configuration conf) throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    // Prefer the reducer for locating the user jar; fall back to the mapper when the
    // reducer is Hadoop's identity Reducer. Both being identity classes is an error.
    if (reducer.equals(Reducer.class)) {
      if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
      }
      job.setJarByClass(mapper);
    } else {
      job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    // Null map output classes mean "inherit Hadoop's defaults" (no-op mapper case).
    if (mapperKey != null) {
      job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
      job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
  }

  /**
   * Builds a job name of the form {@code base-MapperSimpleName-ReducerSimpleName},
   * where {@code base} is the context's existing job name, or {@code className}
   * when no (non-blank) name has been set.
   *
   * @param className fallback base name when the context has no job name
   * @param job context whose current job name is consulted
   * @param mapper mapper class whose simple name is appended
   * @param reducer reducer class whose simple name is appended
   * @return the composed job name
   */
  public static String getCustomJobName(String className, JobContext job,
                                        Class<? extends Mapper> mapper,
                                        Class<? extends Reducer> reducer) {
    StringBuilder name = new StringBuilder(100);
    String customJobName = job.getJobName();
    if (customJobName == null || customJobName.trim().isEmpty()) {
      name.append(className);
    } else {
      name.append(customJobName);
    }
    name.append('-').append(mapper.getSimpleName());
    name.append('-').append(reducer.getSimpleName());
    return name.toString();
  }

  /**
   * Recursively deletes each of the given paths that exists, resolving each
   * against its own file system.
   *
   * @param conf configuration used to resolve file systems; a fresh default
   *   configuration is used when null
   * @param paths paths to delete
   * @throws IOException if a file system operation fails
   */
  public static void delete(Configuration conf, Iterable<Path> paths) throws IOException {
    if (conf == null) {
      conf = new Configuration();
    }
    for (Path path : paths) {
      FileSystem fs = path.getFileSystem(conf);
      if (fs.exists(path)) {
        log.info("Deleting {}", path);
        fs.delete(path, true);
      }
    }
  }

  /** Varargs convenience overload of {@link #delete(Configuration, Iterable)}. */
  public static void delete(Configuration conf, Path... paths) throws IOException {
    delete(conf, Arrays.asList(paths));
  }

  /**
   * Counts the records in a single sequence file by iterating over its values.
   *
   * @param path sequence file to count
   * @param conf Hadoop configuration
   * @return the number of records in the file
   * @throws IOException if the file cannot be read
   */
  public static long countRecords(Path path, Configuration conf) throws IOException {
    long count = 0;
    Iterator<?> iterator = new SequenceFileValueIterator<>(path, true, conf);
    while (iterator.hasNext()) {
      iterator.next();
      count++;
    }
    return count;
  }

  /**
   * Count all the records in a directory using a
   * {@link org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator}
   *
   * @param path The {@link org.apache.hadoop.fs.Path} to count
   * @param pt The {@link org.apache.mahout.common.iterator.sequencefile.PathType}
   * @param filter Apply the {@link org.apache.hadoop.fs.PathFilter}. May be null
   * @param conf The Hadoop {@link org.apache.hadoop.conf.Configuration}
   * @return The number of records
   * @throws IOException if there was an IO error
   */
  public static long countRecords(Path path, PathType pt, PathFilter filter, Configuration conf) throws IOException {
    long count = 0;
    Iterator<?> iterator = new SequenceFileDirValueIterator<>(path, pt, filter, null, true, conf);
    while (iterator.hasNext()) {
      iterator.next();
      count++;
    }
    return count;
  }

  /**
   * Opens an input stream for the given path on its file system.
   * NOTE(review): {@code makeQualified(URI, Path)} expects (defaultUri, workingDir);
   * passing the path itself as the working directory looks suspicious — confirm.
   *
   * @param path file to open
   * @param conf configuration used to resolve the file system
   * @return an open stream; the caller is responsible for closing it
   * @throws IOException if the file cannot be opened
   */
  public static InputStream openStream(Path path, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    return fs.open(path.makeQualified(path.toUri(), path));
  }

  /**
   * Lists the statuses for a path, treating it either as a glob pattern or as a
   * directory, optionally filtered and optionally sorted.
   *
   * @param path path or glob to list
   * @param pathType {@code GLOB} to expand a glob pattern, otherwise a directory listing
   * @param filter optional {@link PathFilter}; may be null
   * @param ordering optional sort order applied to the result; may be null
   * @param conf configuration used to resolve the file system
   * @return the (possibly empty) array of matching file statuses
   * @throws IOException if the listing fails
   */
  public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter,
                                           Comparator<FileStatus> ordering, Configuration conf) throws IOException {
    FileStatus[] statuses;
    FileSystem fs = path.getFileSystem(conf);
    if (filter == null) {
      statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
    } else {
      statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
    }
    if (ordering != null) {
      Arrays.sort(statuses, ordering);
    }
    return statuses;
  }

  /**
   * Like {@link FileSystem#listStatus(Path)}, but returns an empty array instead
   * of throwing when the path does not exist.
   */
  public static FileStatus[] listStatus(FileSystem fs, Path path) throws IOException {
    try {
      return fs.listStatus(path);
    } catch (FileNotFoundException e) {
      return new FileStatus[0];
    }
  }

  /**
   * Like {@link FileSystem#listStatus(Path, PathFilter)}, but returns an empty
   * array instead of throwing when the path does not exist.
   */
  public static FileStatus[] listStatus(FileSystem fs, Path path, PathFilter filter) throws IOException {
    try {
      return fs.listStatus(path, filter);
    } catch (FileNotFoundException e) {
      return new FileStatus[0];
    }
  }

  /**
   * Registers a single file as the distributed cache file set for the job.
   * Note this replaces, rather than appends to, any previously cached files.
   */
  public static void cacheFiles(Path fileToCache, Configuration conf) {
    DistributedCache.setCacheFiles(new URI[]{fileToCache.toUri()}, conf);
  }

  /**
   * Return the first cached file in the list, else null if there are no cached files.
   * @param conf - MapReduce Configuration
   * @return Path of Cached file
   * @throws IOException - IO Exception
   */
  public static Path getSingleCachedFile(Configuration conf) throws IOException {
    return getCachedFiles(conf)[0];
  }

  /**
   * Retrieves paths to cached files.
   * @param conf - MapReduce Configuration
   * @return Path[] of Cached Files
   * @throws IOException - IO Exception
   * @throws IllegalStateException if no cache files are found
   */
  public static Path[] getCachedFiles(Configuration conf) throws IOException {
    LocalFileSystem localFs = FileSystem.getLocal(conf);
    Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);

    URI[] fallbackFiles = DistributedCache.getCacheFiles(conf);

    // fallback for local execution
    if (cacheFiles == null) {

      Preconditions.checkState(fallbackFiles != null, "Unable to find cached files!");

      cacheFiles = new Path[fallbackFiles.length];
      for (int n = 0; n < fallbackFiles.length; n++) {
        cacheFiles[n] = new Path(fallbackFiles[n].getPath());
      }
    } else {

      for (int n = 0; n < cacheFiles.length; n++) {
        cacheFiles[n] = localFs.makeQualified(cacheFiles[n]);
        // fallback for local execution
        // NOTE(review): fallbackFiles is only null-checked in the other branch;
        // if it is null here (or shorter than cacheFiles) this line can throw — confirm.
        if (!localFs.exists(cacheFiles[n])) {
          cacheFiles[n] = new Path(fallbackFiles[n].getPath());
        }
      }
    }

    Preconditions.checkState(cacheFiles.length > 0, "Unable to find cached files!");

    return cacheFiles;
  }

  /**
   * Enables both Java and Writable serialization on the configuration, so that
   * plain {@code Serializable} objects can be used as job keys/values.
   */
  public static void setSerializations(Configuration configuration) {
    configuration.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
  }

  /**
   * Writes a single int to {@code path}, overwriting any existing file.
   * Counterpart of {@link #readInt(Path, Configuration)}.
   */
  public static void writeInt(int value, Path path, Configuration configuration) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), configuration);
    try (FSDataOutputStream out = fs.create(path)) {
      out.writeInt(value);
    }
  }

  /**
   * Reads the single int previously written by {@link #writeInt(int, Path, Configuration)}.
   */
  public static int readInt(Path path, Configuration configuration) throws IOException {
    FileSystem fs = FileSystem.get(path.toUri(), configuration);
    try (FSDataInputStream in = fs.open(path)) {
      return in.readInt();
    }
  }

  /**
   * Builds a comma-separated list of input splits
   * @param fs - File System
   * @param fileStatus - File Status
   * @return list of directories as a comma-separated String
   * @throws IOException - IO Exception
   */
  public static String buildDirList(FileSystem fs, FileStatus fileStatus) throws IOException {
    boolean containsFiles = false;
    List<String> directoriesList = new ArrayList<>();
    // Recurse into subdirectories; a directory is only added to the result itself
    // when it directly contains at least one file.
    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath())) {
      if (childFileStatus.isDir()) {
        String subDirectoryList = buildDirList(fs, childFileStatus);
        directoriesList.add(subDirectoryList);
      } else {
        containsFiles = true;
      }
    }

    if (containsFiles) {
      directoriesList.add(fileStatus.getPath().toUri().getPath());
    }
    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
  }

  /**
   * Builds a comma-separated list of input splits
   * @param fs - File System
   * @param fileStatus - File Status
   * @param pathFilter - path filter
   * @return list of directories as a comma-separated String
   * @throws IOException - IO Exception
   */
  public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException {
    boolean containsFiles = false;
    List<String> directoriesList = new ArrayList<>();
    for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath(), pathFilter)) {
      if (childFileStatus.isDir()) {
        // NOTE(review): recursion calls the unfiltered overload, so pathFilter is only
        // applied at the top level — confirm whether that is intended.
        String subDirectoryList = buildDirList(fs, childFileStatus);
        directoriesList.add(subDirectoryList);
      } else {
        containsFiles = true;
      }
    }

    if (containsFiles) {
      directoriesList.add(fileStatus.getPath().toUri().getPath());
    }
    return Joiner.on(',').skipNulls().join(directoriesList.iterator());
  }

  /**
   * Computes {@code filePath} relative to the configured {@code "baseinputpath"}.
   *
   * @param configuration - configuration; must contain a "baseinputpath" entry
   * @param filePath - Input File Path
   * @return relative file Path
   * @throws IOException - IO Exception
   */
  public static String calcRelativeFilePath(Configuration configuration, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(configuration);
    FileStatus fst = fs.getFileStatus(filePath);
    String currentPath = fst.getPath().toString().replaceFirst("file:", "");

    String basePath = configuration.get("baseinputpath");
    if (!basePath.endsWith("/")) {
      basePath += "/";
    }
    basePath = basePath.replaceFirst("file:", "");
    // NOTE(review): split() treats basePath as a regular expression; paths containing
    // regex metacharacters may not split as expected — confirm inputs are plain paths.
    String[] parts = currentPath.split(basePath);

    if (parts.length == 2) {
      return parts[1];
    } else if (parts.length == 1) {
      return parts[0];
    }
    return currentPath;
  }

  /**
   * Finds a file in the DistributedCache
   *
   * @param partOfFilename a substring of the file name
   * @param localFiles holds references to files stored in distributed cache
   * @return Path to first matched file or null if nothing was found
   **/
  public static Path findInCacheByPartOfFilename(String partOfFilename, URI[] localFiles) {
    for (URI distCacheFile : localFiles) {
      log.info("trying find a file in distributed cache containing [{}] in its name", partOfFilename);
      if (distCacheFile != null && distCacheFile.toString().contains(partOfFilename)) {
        log.info("found file [{}] containing [{}]", distCacheFile.toString(), partOfFilename);
        return new Path(distCacheFile.getPath());
      }
    }
    return null;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
new file mode 100644
index 0000000..dacd66f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntPairWritable.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import org.apache.hadoop.io.BinaryComparable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+
+/**
+ * A {@link WritableComparable} which encapsulates an ordered pair of signed integers.
+ */
+public final class IntPairWritable extends BinaryComparable
+ implements WritableComparable<BinaryComparable>, Cloneable {
+
+ static final int INT_BYTE_LENGTH = 4;
+ static final int INT_PAIR_BYTE_LENGTH = 2 * INT_BYTE_LENGTH;
+ private byte[] b = new byte[INT_PAIR_BYTE_LENGTH];
+
+ public IntPairWritable() {
+ setFirst(0);
+ setSecond(0);
+ }
+
+ public IntPairWritable(IntPairWritable pair) {
+ b = Arrays.copyOf(pair.getBytes(), INT_PAIR_BYTE_LENGTH);
+ }
+
+ public IntPairWritable(int x, int y) {
+ putInt(x, b, 0);
+ putInt(y, b, INT_BYTE_LENGTH);
+ }
+
+ public void set(int x, int y) {
+ putInt(x, b, 0);
+ putInt(y, b, INT_BYTE_LENGTH);
+ }
+
+ public void setFirst(int x) {
+ putInt(x, b, 0);
+ }
+
+ public int getFirst() {
+ return getInt(b, 0);
+ }
+
+ public void setSecond(int y) {
+ putInt(y, b, INT_BYTE_LENGTH);
+ }
+
+ public int getSecond() {
+ return getInt(b, INT_BYTE_LENGTH);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ in.readFully(b);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.write(b);
+ }
+
+ @Override
+ public int hashCode() {
+ return Arrays.hashCode(b);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!super.equals(obj)) {
+ return false;
+ }
+ if (!(obj instanceof IntPairWritable)) {
+ return false;
+ }
+ IntPairWritable other = (IntPairWritable) obj;
+ return Arrays.equals(b, other.b);
+ }
+
+ @Override
+ public int compareTo(BinaryComparable other) {
+ return Comparator.doCompare(b, 0, ((IntPairWritable) other).b, 0);
+ }
+
+ @Override
+ public Object clone() {
+ return new IntPairWritable(this);
+ }
+
+ @Override
+ public String toString() {
+ return "(" + getFirst() + ", " + getSecond() + ')';
+ }
+
+ @Override
+ public byte[] getBytes() {
+ return b;
+ }
+
+ @Override
+ public int getLength() {
+ return INT_PAIR_BYTE_LENGTH;
+ }
+
+ private static void putInt(int value, byte[] b, int offset) {
+ for (int i = offset, j = 24; j >= 0; i++, j -= 8) {
+ b[i] = (byte) (value >> j);
+ }
+ }
+
+ private static int getInt(byte[] b, int offset) {
+ int value = 0;
+ for (int i = offset, j = 24; j >= 0; i++, j -= 8) {
+ value |= (b[i] & 0xFF) << j;
+ }
+ return value;
+ }
+
+ static {
+ WritableComparator.define(IntPairWritable.class, new Comparator());
+ }
+
+ public static final class Comparator extends WritableComparator implements Serializable {
+ public Comparator() {
+ super(IntPairWritable.class);
+ }
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return doCompare(b1, s1, b2, s2);
+ }
+
+ static int doCompare(byte[] b1, int s1, byte[] b2, int s2) {
+ int compare1 = compareInts(b1, s1, b2, s2);
+ if (compare1 != 0) {
+ return compare1;
+ }
+ return compareInts(b1, s1 + INT_BYTE_LENGTH, b2, s2 + INT_BYTE_LENGTH);
+ }
+
+ private static int compareInts(byte[] b1, int s1, byte[] b2, int s2) {
+ // Like WritableComparator.compareBytes(), but treats first byte as signed value
+ int end1 = s1 + INT_BYTE_LENGTH;
+ for (int i = s1, j = s2; i < end1; i++, j++) {
+ int a = b1[i];
+ int b = b2[j];
+ if (i > s1) {
+ a &= 0xff;
+ b &= 0xff;
+ }
+ if (a != b) {
+ return a - b;
+ }
+ }
+ return 0;
+ }
+ }
+
+ /**
+ * Compare only the first part of the pair, so that reduce is called once for each value of the first part.
+ */
+ public static class FirstGroupingComparator extends WritableComparator implements Serializable {
+
+ public FirstGroupingComparator() {
+ super(IntPairWritable.class);
+ }
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ int firstb1 = WritableComparator.readInt(b1, s1);
+ int firstb2 = WritableComparator.readInt(b2, s2);
+ if (firstb1 < firstb2) {
+ return -1;
+ } else if (firstb1 > firstb2) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ @Override
+ public int compare(Object o1, Object o2) {
+ int firstb1 = ((IntPairWritable) o1).getFirst();
+ int firstb2 = ((IntPairWritable) o2).getFirst();
+ if (firstb1 < firstb2) {
+ return -1;
+ }
+ if (firstb1 > firstb2) {
+ return 1;
+ }
+ return 0;
+ }
+
+ }
+
+ /** A wrapper class that associates pairs with frequency (Occurrences) */
+ public static class Frequency implements Comparable<Frequency>, Serializable {
+
+ private final IntPairWritable pair;
+ private final double frequency;
+
+ public Frequency(IntPairWritable bigram, double frequency) {
+ this.pair = new IntPairWritable(bigram);
+ this.frequency = frequency;
+ }
+
+ public double getFrequency() {
+ return frequency;
+ }
+
+ public IntPairWritable getPair() {
+ return pair;
+ }
+
+ @Override
+ public int hashCode() {
+ return pair.hashCode() + RandomUtils.hashDouble(frequency);
+ }
+
+ @Override
+ public boolean equals(Object right) {
+ if (!(right instanceof Frequency)) {
+ return false;
+ }
+ Frequency that = (Frequency) right;
+ return pair.equals(that.pair) && frequency == that.frequency;
+ }
+
+ @Override
+ public int compareTo(Frequency that) {
+ if (frequency < that.frequency) {
+ return -1;
+ }
+ if (frequency > that.frequency) {
+ return 1;
+ }
+ return 0;
+ }
+
+ @Override
+ public String toString() {
+ return pair + "\t" + frequency;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
new file mode 100644
index 0000000..f456d4d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/IntegerTuple.java
@@ -0,0 +1,176 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * An Ordered List of Integers which can be used in a Hadoop Map/Reduce Job
+ *
+ *
+ */
+public final class IntegerTuple implements WritableComparable<IntegerTuple> {
+
+ private List<Integer> tuple = Lists.newArrayList();
+
+ public IntegerTuple() { }
+
+ public IntegerTuple(Integer firstEntry) {
+ add(firstEntry);
+ }
+
+ public IntegerTuple(Iterable<Integer> entries) {
+ for (Integer entry : entries) {
+ add(entry);
+ }
+ }
+
+ public IntegerTuple(Integer[] entries) {
+ for (Integer entry : entries) {
+ add(entry);
+ }
+ }
+
+ /**
+ * add an entry to the end of the list
+ *
+ * @param entry
+ * @return true if the items get added
+ */
+ public boolean add(Integer entry) {
+ return tuple.add(entry);
+ }
+
+ /**
+ * Fetches the string at the given location
+ *
+ * @param index
+ * @return String value at the given location in the tuple list
+ */
+ public Integer integerAt(int index) {
+ return tuple.get(index);
+ }
+
+ /**
+ * Replaces the string at the given index with the given newString
+ *
+ * @param index
+ * @param newInteger
+ * @return The previous value at that location
+ */
+ public Integer replaceAt(int index, Integer newInteger) {
+ return tuple.set(index, newInteger);
+ }
+
+ /**
+ * Fetch the list of entries from the tuple
+ *
+ * @return a List containing the strings in the order of insertion
+ */
+ public List<Integer> getEntries() {
+ return Collections.unmodifiableList(this.tuple);
+ }
+
+ /**
+ * Returns the length of the tuple
+ *
+ * @return length
+ */
+ public int length() {
+ return this.tuple.size();
+ }
+
+ @Override
+ public String toString() {
+ return tuple.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return tuple.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ IntegerTuple other = (IntegerTuple) obj;
+ if (tuple == null) {
+ if (other.tuple != null) {
+ return false;
+ }
+ } else if (!tuple.equals(other.tuple)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ int len = in.readInt();
+ tuple = Lists.newArrayListWithCapacity(len);
+ for (int i = 0; i < len; i++) {
+ int data = in.readInt();
+ tuple.add(data);
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(tuple.size());
+ for (Integer entry : tuple) {
+ out.writeInt(entry);
+ }
+ }
+
+ @Override
+ public int compareTo(IntegerTuple otherTuple) {
+ int thisLength = length();
+ int otherLength = otherTuple.length();
+ int min = Math.min(thisLength, otherLength);
+ for (int i = 0; i < min; i++) {
+ int ret = this.tuple.get(i).compareTo(otherTuple.integerAt(i));
+ if (ret == 0) {
+ continue;
+ }
+ return ret;
+ }
+ if (thisLength < otherLength) {
+ return -1;
+ } else if (thisLength > otherLength) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
new file mode 100644
index 0000000..5215e3a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/LongPair.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+
+import com.google.common.primitives.Longs;
+
+/** A simple (ordered) pair of longs. */
/** A simple (ordered) pair of longs. */
public final class LongPair implements Comparable<LongPair>, Serializable {

  private final long first;
  private final long second;

  public LongPair(long first, long second) {
    this.first = first;
    this.second = second;
  }

  /** @return the first element of the pair */
  public long getFirst() {
    return first;
  }

  /** @return the second element of the pair */
  public long getSecond() {
    return second;
  }

  /** @return a new pair with the two elements exchanged */
  public LongPair swap() {
    return new LongPair(second, first);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof LongPair)) {
      return false;
    }
    LongPair otherPair = (LongPair) obj;
    return first == otherPair.getFirst() && second == otherPair.getSecond();
  }

  @Override
  public int hashCode() {
    // (int) (x ^ (x >>> 32)) is exactly the value Guava's Longs.hashCode(x)
    // (and Long.hashCode) produces, so dropping the third-party call changes
    // no hash values.
    int firstHash = (int) (first ^ (first >>> 32));
    // Flip top and bottom 16 bits; this makes the hash function probably different
    // for (a,b) versus (b,a)
    return (firstHash >>> 16 | firstHash << 16) ^ (int) (second ^ (second >>> 32));
  }

  @Override
  public String toString() {
    return '(' + String.valueOf(first) + ',' + second + ')';
  }

  /** Orders by first element, breaking ties on the second. */
  @Override
  public int compareTo(LongPair o) {
    int byFirst = Long.compare(first, o.getFirst());
    return byFirst != 0 ? byFirst : Long.compare(second, o.getSecond());
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
new file mode 100644
index 0000000..f241b53
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/MemoryUtil.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.common;
+
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Memory utilities.
+ */
+public final class MemoryUtil {
+
+ private static final Logger log = LoggerFactory.getLogger(MemoryUtil.class);
+
+ private MemoryUtil() {
+ }
+
+ /**
+ * Logs current heap memory statistics.
+ *
+ * @see Runtime
+ */
+ public static void logMemoryStatistics() {
+ Runtime runtime = Runtime.getRuntime();
+ long freeBytes = runtime.freeMemory();
+ long maxBytes = runtime.maxMemory();
+ long totalBytes = runtime.totalMemory();
+ long usedBytes = totalBytes - freeBytes;
+ log.info("Memory (bytes): {} used, {} heap, {} max", usedBytes, totalBytes,
+ maxBytes);
+ }
+
+ private static volatile ScheduledExecutorService scheduler;
+
+ /**
+ * Constructs and starts a memory logger thread.
+ *
+ * @param rateInMillis how often memory info should be logged.
+ */
+ public static void startMemoryLogger(long rateInMillis) {
+ stopMemoryLogger();
+ scheduler = Executors.newScheduledThreadPool(1, new ThreadFactory() {
+ private final ThreadFactory delegate = Executors.defaultThreadFactory();
+
+ @Override
+ public Thread newThread(Runnable r) {
+ Thread t = delegate.newThread(r);
+ t.setDaemon(true);
+ return t;
+ }
+ });
+ Runnable memoryLoogerRunnable = new Runnable() {
+ @Override
+ public void run() {
+ logMemoryStatistics();
+ }
+ };
+ scheduler.scheduleAtFixedRate(memoryLoogerRunnable, rateInMillis, rateInMillis,
+ TimeUnit.MILLISECONDS);
+ }
+
+ /**
+ * Constructs and starts a memory logger thread with a logging rate of 1000 milliseconds.
+ */
+ public static void startMemoryLogger() {
+ startMemoryLogger(1000);
+ }
+
+ /**
+ * Stops the memory logger, if any, started via {@link #startMemoryLogger(long)} or
+ * {@link #startMemoryLogger()}.
+ */
+ public static void stopMemoryLogger() {
+ if (scheduler != null) {
+ scheduler.shutdownNow();
+ scheduler = null;
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
new file mode 100644
index 0000000..d2ad6a1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Pair.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+
/** A simple (ordered) pair of two objects. Elements may be null. */
public final class Pair<A,B> implements Comparable<Pair<A,B>>, Serializable {

  private final A first;
  private final B second;

  public Pair(A first, B second) {
    this.first = first;
    this.second = second;
  }

  public A getFirst() {
    return first;
  }

  public B getSecond() {
    return second;
  }

  /** @return a new pair with the two elements exchanged */
  public Pair<B, A> swap() {
    return new Pair<>(second, first);
  }

  /** Static factory; a shorthand that infers the type arguments from the values. */
  public static <A,B> Pair<A,B> of(A a, B b) {
    return new Pair<>(a, b);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof Pair<?, ?>)) {
      return false;
    }
    Pair<?, ?> otherPair = (Pair<?, ?>) obj;
    return isEqualOrNulls(first, otherPair.getFirst())
        && isEqualOrNulls(second, otherPair.getSecond());
  }

  /** Null-safe equality: two nulls are equal; null never equals non-null. */
  private static boolean isEqualOrNulls(Object obj1, Object obj2) {
    return obj1 == null ? obj2 == null : obj1.equals(obj2);
  }

  @Override
  public int hashCode() {
    int firstHash = hashCodeNull(first);
    // Flip top and bottom 16 bits; this makes the hash function probably different
    // for (a,b) versus (b,a)
    return (firstHash >>> 16 | firstHash << 16) ^ hashCodeNull(second);
  }

  /** Null-safe hash; null hashes to 0. */
  private static int hashCodeNull(Object obj) {
    return obj == null ? 0 : obj.hashCode();
  }

  @Override
  public String toString() {
    return '(' + String.valueOf(first) + ',' + second + ')';
  }

  /**
   * Defines an ordering on pairs that sorts by first value's natural ordering, ascending,
   * and then by second value's natural ordering. Unlike {@link #equals(Object)}, this does
   * not support null elements: a null first (or, on ties, second) causes a
   * NullPointerException.
   *
   * @throws ClassCastException if types are not actually {@link Comparable}
   */
  @SuppressWarnings("unchecked") // unchecked casts are the documented contract: non-Comparable types throw CCE
  @Override
  public int compareTo(Pair<A,B> other) {
    Comparable<A> thisFirst = (Comparable<A>) first;
    A thatFirst = other.getFirst();
    int compare = thisFirst.compareTo(thatFirst);
    if (compare != 0) {
      return compare;
    }
    Comparable<B> thisSecond = (Comparable<B>) second;
    B thatSecond = other.getSecond();
    return thisSecond.compareTo(thatSecond);
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
new file mode 100644
index 0000000..e74c534
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/Parameters.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.IOException;
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.DefaultStringifier;
+import org.apache.hadoop.util.GenericsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Parameters {
+
+ private static final Logger log = LoggerFactory.getLogger(Parameters.class);
+
+ private Map<String,String> params = Maps.newHashMap();
+
+ public Parameters() {
+
+ }
+
+ public Parameters(String serializedString) throws IOException {
+ this(parseParams(serializedString));
+ }
+
+ protected Parameters(Map<String,String> params) {
+ this.params = params;
+ }
+
+ public String get(String key) {
+ return params.get(key);
+ }
+
+ public String get(String key, String defaultValue) {
+ String ret = params.get(key);
+ return ret == null ? defaultValue : ret;
+ }
+
+ public void set(String key, String value) {
+ params.put(key, value);
+ }
+
+ public int getInt(String key, int defaultValue) {
+ String ret = params.get(key);
+ return ret == null ? defaultValue : Integer.parseInt(ret);
+ }
+
+ @Override
+ public String toString() {
+ Configuration conf = new Configuration();
+ conf.set("io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
+ DefaultStringifier<Map<String,String>> mapStringifier = new DefaultStringifier<>(conf,
+ GenericsUtil.getClass(params));
+ try {
+ return mapStringifier.toString(params);
+ } catch (IOException e) {
+ log.info("Encountered IOException while deserializing returning empty string", e);
+ return "";
+ }
+
+ }
+
+ public String print() {
+ return params.toString();
+ }
+
+ public static Map<String,String> parseParams(String serializedString) throws IOException {
+ Configuration conf = new Configuration();
+ conf.set("io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
+ Map<String,String> params = Maps.newHashMap();
+ DefaultStringifier<Map<String,String>> mapStringifier = new DefaultStringifier<>(conf,
+ GenericsUtil.getClass(params));
+ return mapStringifier.fromString(serializedString);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
new file mode 100644
index 0000000..0de1a4a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringTuple.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * An Ordered List of Strings which can be used in a Hadoop Map/Reduce Job
+ */
+public final class StringTuple implements WritableComparable<StringTuple> {
+
+ private List<String> tuple = Lists.newArrayList();
+
+ public StringTuple() { }
+
+ public StringTuple(String firstEntry) {
+ add(firstEntry);
+ }
+
+ public StringTuple(Iterable<String> entries) {
+ for (String entry : entries) {
+ add(entry);
+ }
+ }
+
+ public StringTuple(String[] entries) {
+ for (String entry : entries) {
+ add(entry);
+ }
+ }
+
+ /**
+ * add an entry to the end of the list
+ *
+ * @param entry
+ * @return true if the items get added
+ */
+ public boolean add(String entry) {
+ return tuple.add(entry);
+ }
+
+ /**
+ * Fetches the string at the given location
+ *
+ * @param index
+ * @return String value at the given location in the tuple list
+ */
+ public String stringAt(int index) {
+ return tuple.get(index);
+ }
+
+ /**
+ * Replaces the string at the given index with the given newString
+ *
+ * @param index
+ * @param newString
+ * @return The previous value at that location
+ */
+ public String replaceAt(int index, String newString) {
+ return tuple.set(index, newString);
+ }
+
+ /**
+ * Fetch the list of entries from the tuple
+ *
+ * @return a List containing the strings in the order of insertion
+ */
+ public List<String> getEntries() {
+ return Collections.unmodifiableList(this.tuple);
+ }
+
+ /**
+ * Returns the length of the tuple
+ *
+ * @return length
+ */
+ public int length() {
+ return this.tuple.size();
+ }
+
+ @Override
+ public String toString() {
+ return tuple.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return tuple.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ StringTuple other = (StringTuple) obj;
+ if (tuple == null) {
+ if (other.tuple != null) {
+ return false;
+ }
+ } else if (!tuple.equals(other.tuple)) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ int len = in.readInt();
+ tuple = Lists.newArrayListWithCapacity(len);
+ Text value = new Text();
+ for (int i = 0; i < len; i++) {
+ value.readFields(in);
+ tuple.add(value.toString());
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(tuple.size());
+ Text value = new Text();
+ for (String entry : tuple) {
+ value.set(entry);
+ value.write(out);
+ }
+ }
+
+ @Override
+ public int compareTo(StringTuple otherTuple) {
+ int thisLength = length();
+ int otherLength = otherTuple.length();
+ int min = Math.min(thisLength, otherLength);
+ for (int i = 0; i < min; i++) {
+ int ret = this.tuple.get(i).compareTo(otherTuple.stringAt(i));
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (thisLength < otherLength) {
+ return -1;
+ } else if (thisLength > otherLength) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
new file mode 100644
index 0000000..a064596
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/StringUtils.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.util.regex.Pattern;
+
+import com.thoughtworks.xstream.XStream;
+
+/**
+ * Offers two methods to convert an object to a string representation and restore the object given its string
+ * representation. Should use Hadoop Stringifier whenever available.
+ */
+public final class StringUtils {
+
+ private static final XStream XSTREAM = new XStream();
+ private static final Pattern NEWLINE_PATTERN = Pattern.compile("\n");
+ private static final Pattern XMLRESERVED = Pattern.compile("\"|\\&|\\<|\\>|\'");
+
+ private StringUtils() {
+ // do nothing
+ }
+
+ /**
+ * Converts the object to a one-line string representation
+ *
+ * @param obj
+ * the object to convert
+ * @return the string representation of the object
+ */
+ public static String toString(Object obj) {
+ return NEWLINE_PATTERN.matcher(XSTREAM.toXML(obj)).replaceAll("");
+ }
+
+ /**
+ * Restores the object from its string representation.
+ *
+ * @param str
+ * the string representation of the object
+ * @return restored object
+ */
+ public static <T> T fromString(String str) {
+ return (T) XSTREAM.fromXML(str);
+ }
+
+ public static String escapeXML(CharSequence input) {
+ return XMLRESERVED.matcher(input).replaceAll("_");
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
new file mode 100644
index 0000000..5ee2066
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/TimingStatistics.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common;
+
+import java.io.Serializable;
+import java.text.DecimalFormat;
+
/**
 * Accumulates call-timing statistics (count, min, max, sum, sum of squares) in nanoseconds,
 * as measured by {@link System#nanoTime()} between {@link Call} creation and {@code end()}.
 * Accessors are synchronized; updates from {@link Call#end()} synchronize on this instance.
 */
public final class TimingStatistics implements Serializable {
  // NOTE: DecimalFormat output is locale-dependent; toString() is for human consumption only.
  private static final DecimalFormat DF = new DecimalFormat("#.##");
  private int nCalls;
  private long minTime;            // nanoseconds
  private long maxTime;            // nanoseconds
  private long sumTime;            // nanoseconds
  private long leadSumTime;        // nanoseconds accumulated by ignored lead-time calls
  // double rather than long: a long sum of squares overflows for any single call over ~3 seconds.
  private double sumSquaredTime;


  /** Creates a new instance of CallStats */
  public TimingStatistics() { }

  public TimingStatistics(int nCalls, long minTime, long maxTime, long sumTime, double sumSquaredTime) {
    this.nCalls = nCalls;
    this.minTime = minTime;
    this.maxTime = maxTime;
    this.sumTime = sumTime;
    this.sumSquaredTime = sumSquaredTime;
  }

  public synchronized int getNCalls() {
    return nCalls;
  }

  /** @return the minimum observed call time in nanoseconds, clamped to be non-negative */
  public synchronized long getMinTime() {
    return Math.max(0, minTime);
  }

  public synchronized long getMaxTime() {
    return maxTime;
  }

  public synchronized long getSumTime() {
    return sumTime;
  }

  public synchronized double getSumSquaredTime() {
    return sumSquaredTime;
  }

  /** @return mean call time in nanoseconds, or 0 if no calls have been recorded */
  public synchronized long getMeanTime() {
    return nCalls == 0 ? 0 : sumTime / nCalls;
  }

  /** @return population standard deviation of call times in nanoseconds, or 0 if no calls */
  public synchronized long getStdDevTime() {
    if (nCalls == 0) {
      return 0;
    }
    double mean = getMeanTime();
    double meanSquared = mean * mean;
    double meanOfSquares = sumSquaredTime / nCalls;
    double variance = meanOfSquares - meanSquared;
    if (variance < 0) {
      return 0; // might happen due to rounding error
    }
    return (long) Math.sqrt(variance);
  }

  @Override
  public synchronized String toString() {
    return '\n'
        + "nCalls = " + nCalls + ";\n"
        + "sum = " + DF.format(sumTime / 1000000000.0) + "s;\n"
        + "min = " + DF.format(minTime / 1000000.0) + "ms;\n"
        + "max = " + DF.format(maxTime / 1000000.0) + "ms;\n"
        + "mean = " + DF.format(getMeanTime() / 1000.0) + "us;\n"
        + "stdDev = " + DF.format(getStdDevTime() / 1000.0) + "us;";
  }

  /**
   * Ignores counting the performance metrics until the accumulated lead time exceeds
   * {@code leadTimeUsec}. The caller should allow enough time for the JIT to warm up.
   *
   * <p>NOTE(review): the parameter name says microseconds, but it is compared against a sum
   * of {@link System#nanoTime()} deltas (nanoseconds) — confirm the intended unit with callers.</p>
   */
  public Call newCall(long leadTimeUsec) {
    if (leadSumTime > leadTimeUsec) {
      return new Call();
    } else {
      return new LeadTimeCall();
    }
  }

  /**
   * A call whose elapsed time is added only to the lead-time budget, not to the statistics.
   * The caller should allow enough time for the JIT to warm up.
   */
  public final class LeadTimeCall extends Call {

    private LeadTimeCall() { }

    @Override
    public void end() {
      long elapsed = System.nanoTime() - startTime;
      synchronized (TimingStatistics.this) {
        leadSumTime += elapsed;
      }
    }

    /** Always returns false: lead-time calls never contribute to sumTime. */
    @Override
    public boolean end(long sumMaxUsec) {
      end();
      return false;
    }
  }

  /**
   * A call object that can update performance metrics.
   */
  public class Call {
    protected final long startTime = System.nanoTime();

    private Call() { }

    /** Records the elapsed time since construction into the enclosing statistics. */
    public void end() {
      long elapsed = System.nanoTime() - startTime;
      synchronized (TimingStatistics.this) {
        nCalls++;
        if (elapsed < minTime || nCalls == 1) {
          minTime = elapsed;
        }
        if (elapsed > maxTime) {
          maxTime = elapsed;
        }
        sumTime += elapsed;
        // Widen before multiplying: elapsed * elapsed in long arithmetic overflows
        // for any call longer than about 3 seconds.
        sumSquaredTime += (double) elapsed * elapsed;
      }
    }

    /**
     * Ends the call and reports whether the accumulated sumTime has reached the given limit.
     */
    public boolean end(long sumMaxUsec) {
      end();
      return sumMaxUsec < sumTime;
    }
  }
}
r***@apache.org
2018-06-28 14:54:32 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
new file mode 100644
index 0000000..0f6f7f2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansDriver.java
@@ -0,0 +1,493 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Classifies the vectors into different clusters found by the clustering
+ * algorithm.
+ */
+public final class StreamingKMeansDriver extends AbstractJob {
+ /**
+ * Streaming KMeans options
+ */
+ /**
+ * The number of cluster that Mappers will use should be \(O(k log n)\) where k is the number of clusters
+ * to get at the end and n is the number of points to cluster. This doesn't need to be exact.
+ * It will be adjusted at runtime.
+ */
+ public static final String ESTIMATED_NUM_MAP_CLUSTERS = "estimatedNumMapClusters";
+ /**
+ * The initial estimated distance cutoff between two points for forming new clusters.
+ * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans
+ * Defaults to 10e-6.
+ */
+ public static final String ESTIMATED_DISTANCE_CUTOFF = "estimatedDistanceCutoff";
+
+ /**
+ * Ball KMeans options
+ */
+ /**
+ * After mapping finishes, we get an intermediate set of vectors that represent approximate
+ * clusterings of the data from each Mapper. These can be clustered by the Reducer using
+ * BallKMeans in memory. This variable is the maximum number of iterations in the final
+ * BallKMeans algorithm.
+ * Defaults to 10.
+ */
+ public static final String MAX_NUM_ITERATIONS = "maxNumIterations";
+ /**
+ * The "ball" aspect of ball k-means means that only the closest points to the centroid will actually be used
+ * for updating. The fraction of the points to be used is those points whose distance to the center is within
+ * trimFraction * distance to the closest other center.
+ * Defaults to 0.9.
+ */
+ public static final String TRIM_FRACTION = "trimFraction";
+ /**
+ * Whether to use k-means++ initialization or random initialization of the seed centroids.
+ * Essentially, k-means++ provides better clusters, but takes longer, whereas random initialization takes less
+ * time, but produces worse clusters, and tends to fail more often and needs multiple runs to compare to
+ * k-means++. If set, uses randomInit.
+ * @see org.apache.mahout.clustering.streaming.cluster.BallKMeans
+ */
+ public static final String RANDOM_INIT = "randomInit";
+ /**
+ * Whether to correct the weights of the centroids after the clustering is done. The weights end up being wrong
+ * because of the trimFraction and possible train/test splits. In some cases, especially in a pipeline, having
+ * an accurate count of the weights is useful. If set, ignores the final weights.
+ */
+ public static final String IGNORE_WEIGHTS = "ignoreWeights";
+ /**
+ * The percentage of points that go into the "test" set when evaluating BallKMeans runs in the reducer.
+ */
+ public static final String TEST_PROBABILITY = "testProbability";
+ /**
+ * The percentage of points that go into the "training" set when evaluating BallKMeans runs in the reducer.
+ */
+ public static final String NUM_BALLKMEANS_RUNS = "numBallKMeansRuns";
+
+ /**
+ Searcher options
+ */
+ /**
+ * The Searcher class when performing nearest neighbor search in StreamingKMeans.
+ * Defaults to ProjectionSearch.
+ */
+ public static final String SEARCHER_CLASS_OPTION = "searcherClass";
+ /**
+ * The number of projections to use when using a projection searcher like ProjectionSearch or
+ * FastProjectionSearch. Projection searches work by projection the all the vectors on to a set of
+ * basis vectors and searching for the projected query in that totally ordered set. This
+ * however can produce false positives (vectors that are closer when projected than they would
+ * actually be.
+ * So, there must be more than one projection vectors in the basis. This variable is the number
+ * of vectors in a basis.
+ * Defaults to 3
+ */
+ public static final String NUM_PROJECTIONS_OPTION = "numProjections";
+ /**
+ * When using approximate searches (anything that's not BruteSearch),
+ * more than just the seemingly closest element must be considered. This variable has different
+ * meanings depending on the actual Searcher class used but is a measure of how many candidates
+ * will be considered.
+ * See the ProjectionSearch, FastProjectionSearch, LocalitySensitiveHashSearch classes for more
+ * details.
+ * Defaults to 2.
+ */
+ public static final String SEARCH_SIZE_OPTION = "searchSize";
+
+ /**
+ * Whether to run another pass of StreamingKMeans on the reducer's points before BallKMeans. On some data sets
+ * with a large number of mappers, the intermediate number of clusters passed to the reducer is too large to
+ * fit into memory directly, hence the option to collapse the clusters further with StreamingKMeans.
+ */
+ public static final String REDUCE_STREAMING_KMEANS = "reduceStreamingKMeans";
+
+ private static final Logger log = LoggerFactory.getLogger(StreamingKMeansDriver.class);
+
+ public static final float INVALID_DISTANCE_CUTOFF = -1;
+
+ @Override
+ public int run(String[] args) throws Exception {
+ // Standard options for any Mahout job.
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+
+ // The number of clusters to create for the data.
+ addOption(DefaultOptionCreator.numClustersOption().withDescription(
+ "The k in k-Means. Approximately this many clusters will be generated.").create());
+
+ // StreamingKMeans (mapper) options
+ // There will be k final clusters, but in the Map phase to get a good approximation of the data, O(k log n)
+ // clusters are needed. Since n is the number of data points and not knowable until reading all the vectors,
+ // provide a decent estimate.
+ addOption(ESTIMATED_NUM_MAP_CLUSTERS, "km", "The estimated number of clusters to use for the "
+ + "Map phase of the job when running StreamingKMeans. This should be around k * log(n), "
+ + "where k is the final number of clusters and n is the total number of data points to "
+ + "cluster.", String.valueOf(1));
+
+ addOption(ESTIMATED_DISTANCE_CUTOFF, "e", "The initial estimated distance cutoff between two "
+ + "points for forming new clusters. If no value is given, it's estimated from the data set",
+ String.valueOf(INVALID_DISTANCE_CUTOFF));
+
+ // BallKMeans (reducer) options
+ addOption(MAX_NUM_ITERATIONS, "mi", "The maximum number of iterations to run for the "
+ + "BallKMeans algorithm used by the reducer. If no value is given, defaults to 10.", String.valueOf(10));
+
+ addOption(TRIM_FRACTION, "tf", "The 'ball' aspect of ball k-means means that only the closest points "
+ + "to the centroid will actually be used for updating. The fraction of the points to be used is those "
+ + "points whose distance to the center is within trimFraction * distance to the closest other center. "
+ + "If no value is given, defaults to 0.9.", String.valueOf(0.9));
+
+ addFlag(RANDOM_INIT, "ri", "Whether to use k-means++ initialization or random initialization "
+ + "of the seed centroids. Essentially, k-means++ provides better clusters, but takes longer, whereas random "
+ + "initialization takes less time, but produces worse clusters, and tends to fail more often and needs "
+ + "multiple runs to compare to k-means++. If set, uses the random initialization.");
+
+ addFlag(IGNORE_WEIGHTS, "iw", "Whether to correct the weights of the centroids after the clustering is done. "
+ + "The weights end up being wrong because of the trimFraction and possible train/test splits. In some cases, "
+ + "especially in a pipeline, having an accurate count of the weights is useful. If set, ignores the final "
+ + "weights");
+
+ addOption(TEST_PROBABILITY, "testp", "A double value between 0 and 1 that represents the percentage of "
+ + "points to be used for 'testing' different clustering runs in the final BallKMeans "
+ + "step. If no value is given, defaults to 0.1", String.valueOf(0.1));
+
+ addOption(NUM_BALLKMEANS_RUNS, "nbkm", "Number of BallKMeans runs to use at the end to try to cluster the "
+ + "points. If no value is given, defaults to 4", String.valueOf(4));
+
+ // Nearest neighbor search options
+ // The distance measure used for computing the distance between two points. Generally, the
+ // SquaredEuclideanDistance is used for clustering problems (it's equivalent to CosineDistance for normalized
+ // vectors).
+ // WARNING! You can use any metric but most of the literature is for the squared euclidean distance.
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+
+ // The default searcher should be something more efficient that BruteSearch (ProjectionSearch, ...). See
+ // o.a.m.math.neighborhood.*
+ addOption(SEARCHER_CLASS_OPTION, "sc", "The type of searcher to be used when performing nearest "
+ + "neighbor searches. Defaults to ProjectionSearch.", ProjectionSearch.class.getCanonicalName());
+
+ // In the original paper, the authors used 1 projection vector.
+ addOption(NUM_PROJECTIONS_OPTION, "np", "The number of projections considered in estimating the "
+ + "distances between vectors. Only used when the distance measure requested is either "
+ + "ProjectionSearch or FastProjectionSearch. If no value is given, defaults to 3.", String.valueOf(3));
+
+ addOption(SEARCH_SIZE_OPTION, "s", "In more efficient searches (non BruteSearch), "
+ + "not all distances are calculated for determining the nearest neighbors. The number of "
+ + "elements whose distances from the query vector is actually computer is proportional to "
+ + "searchSize. If no value is given, defaults to 1.", String.valueOf(2));
+
+ addFlag(REDUCE_STREAMING_KMEANS, "rskm", "There might be too many intermediate clusters from the mapper "
+ + "to fit into memory, so the reducer can run another pass of StreamingKMeans to collapse them down to a "
+ + "fewer clusters");
+
+ addOption(DefaultOptionCreator.methodOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ configureOptionsForWorkers();
+ run(getConf(), getInputPath(), output);
+ return 0;
+ }
+
+  /**
+   * Reads the parsed command-line options, converts them to their native types and forwards
+   * them to the static overload, which validates them and stores them in the job Configuration
+   * for the mappers and reducers to pick up.
+   */
+  private void configureOptionsForWorkers() throws ClassNotFoundException {
+    log.info("Starting to configure options for workers");
+
+    String method = getOption(DefaultOptionCreator.METHOD_OPTION);
+    int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+    // StreamingKMeans (mapper) parameters.
+    int estimatedNumMapClusters = Integer.parseInt(getOption(ESTIMATED_NUM_MAP_CLUSTERS));
+    float estimatedDistanceCutoff = Float.parseFloat(getOption(ESTIMATED_DISTANCE_CUTOFF));
+
+    // BallKMeans (reducer) parameters.
+    int maxNumIterations = Integer.parseInt(getOption(MAX_NUM_ITERATIONS));
+    float trimFraction = Float.parseFloat(getOption(TRIM_FRACTION));
+    boolean randomInit = hasOption(RANDOM_INIT);
+    boolean ignoreWeights = hasOption(IGNORE_WEIGHTS);
+    float testProbability = Float.parseFloat(getOption(TEST_PROBABILITY));
+    int numBallKMeansRuns = Integer.parseInt(getOption(NUM_BALLKMEANS_RUNS));
+
+    // Nearest neighbor search parameters. BruteSearch needs neither a search size nor a number
+    // of projections; every other searcher reads both options (the projection count is simply
+    // unused by searchers that are not projection based).
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    String searcherClass = getOption(SEARCHER_CLASS_OPTION);
+    boolean needsSearchParameters = !searcherClass.equals(BruteSearch.class.getName());
+
+    // The search size to use. This is quite fuzzy and might end up not being configurable at all.
+    int searchSize = needsSearchParameters ? Integer.parseInt(getOption(SEARCH_SIZE_OPTION)) : 0;
+
+    // The number of projections to use. This is only useful in projection searches which
+    // project the vectors on multiple basis vectors to get distance estimates that are faster to
+    // calculate.
+    int numProjections = needsSearchParameters ? Integer.parseInt(getOption(NUM_PROJECTIONS_OPTION)) : 0;
+
+    boolean reduceStreamingKMeans = hasOption(REDUCE_STREAMING_KMEANS);
+
+    configureOptionsForWorkers(getConf(), numClusters,
+        /* StreamingKMeans */
+        estimatedNumMapClusters, estimatedDistanceCutoff,
+        /* BallKMeans */
+        maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns,
+        /* Searcher */
+        measureClass, searcherClass, searchSize, numProjections,
+        method,
+        reduceStreamingKMeans);
+  }
+
+  /**
+   * Checks the parameters for a StreamingKMeans job and prepares a Configuration with them.
+   *
+   * @param conf the Configuration to populate
+   * @param numClusters k, the number of clusters at the end
+   * @param estimatedNumMapClusters O(k log n), the number of clusters requested from each mapper
+   * @param estimatedDistanceCutoff an estimate of the minimum distance that separates two clusters (can be smaller and
+   *                                will be increased dynamically)
+   * @param maxNumIterations the maximum number of iterations of BallKMeans
+   * @param trimFraction the fraction of the points to be considered in updating a ball k-means
+   * @param randomInit whether to initialize the ball k-means seeds randomly
+   * @param ignoreWeights whether to ignore the invalid final ball k-means weights
+   * @param testProbability the percentage of vectors assigned to the test set for selecting the best final centers
+   * @param numBallKMeansRuns the number of BallKMeans runs in the reducer that determine the centroids to return
+   *                          (clusters are computed for the training set and the error is computed on the test set)
+   * @param measureClass string, name of the distance measure class; theory works for Euclidean-like distances
+   * @param searcherClass string, name of the searcher that will be used for nearest neighbor search
+   * @param searchSize the number of closest neighbors to look at for selecting the closest one in approximate nearest
+   *                   neighbor searches
+   * @param numProjections the number of projected vectors to use for faster searching (only useful for ProjectionSearch
+   *                       or FastProjectionSearch); @see org.apache.mahout.math.neighborhood.ProjectionSearch
+   * @param method the execution method, sequential or MapReduce; see DefaultOptionCreator.METHOD_OPTION
+   * @param reduceStreamingKMeans whether the reducer should run another StreamingKMeans pass to collapse the
+   *                              intermediate clusters before BallKMeans
+   * @throws ClassNotFoundException if the distance measure or searcher class cannot be loaded
+   */
+  public static void configureOptionsForWorkers(Configuration conf,
+                                                int numClusters,
+                                                /* StreamingKMeans */
+                                                int estimatedNumMapClusters, float estimatedDistanceCutoff,
+                                                /* BallKMeans */
+                                                int maxNumIterations, float trimFraction, boolean randomInit,
+                                                boolean ignoreWeights, float testProbability, int numBallKMeansRuns,
+                                                /* Searcher */
+                                                String measureClass, String searcherClass,
+                                                int searchSize, int numProjections,
+                                                String method,
+                                                boolean reduceStreamingKMeans) throws ClassNotFoundException {
+    // Checking preconditions for the parameters.
+    Preconditions.checkArgument(numClusters > 0,
+        "Invalid number of clusters requested: " + numClusters + ". Must be: numClusters > 0!");
+
+    // StreamingKMeans
+    Preconditions.checkArgument(estimatedNumMapClusters > numClusters, "Invalid number of estimated map "
+        + "clusters; There must be more than the final number of clusters (k log n vs k)");
+    Preconditions.checkArgument(estimatedDistanceCutoff == INVALID_DISTANCE_CUTOFF || estimatedDistanceCutoff > 0,
+        "estimatedDistanceCutoff must be equal to -1 or must be greater than 0!");
+
+    // BallKMeans
+    Preconditions.checkArgument(maxNumIterations > 0, "Must have at least one BallKMeans iteration");
+    Preconditions.checkArgument(trimFraction > 0, "trimFraction must be positive");
+    Preconditions.checkArgument(testProbability >= 0 && testProbability < 1, "test probability is not in the "
+        + "interval [0, 1)");
+    // The check rejects zero as well, so the message must say "positive", not "cannot be negative".
+    Preconditions.checkArgument(numBallKMeansRuns > 0, "numBallKMeansRuns must be positive");
+
+    // Searcher
+    if (!searcherClass.contains("Brute")) {
+      // These tests only make sense when a relevant searcher is being used.
+      Preconditions.checkArgument(searchSize > 0, "Invalid searchSize. Must be positive.");
+      if (searcherClass.contains("Projection")) {
+        Preconditions.checkArgument(numProjections > 0, "Invalid numProjections. Must be positive");
+      }
+    }
+
+    // Setting the parameters in the Configuration.
+    conf.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, numClusters);
+    /* StreamingKMeans */
+    conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
+    if (estimatedDistanceCutoff != INVALID_DISTANCE_CUTOFF) {
+      conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
+    }
+    /* BallKMeans */
+    conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
+    conf.setFloat(TRIM_FRACTION, trimFraction);
+    conf.setBoolean(RANDOM_INIT, randomInit);
+    conf.setBoolean(IGNORE_WEIGHTS, ignoreWeights);
+    conf.setFloat(TEST_PROBABILITY, testProbability);
+    conf.setInt(NUM_BALLKMEANS_RUNS, numBallKMeansRuns);
+    /* Searcher */
+    // Checks if the measureClass is available, throws exception otherwise.
+    Class.forName(measureClass);
+    conf.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, measureClass);
+    // Checks if the searcherClass is available, throws exception otherwise.
+    Class.forName(searcherClass);
+    conf.set(SEARCHER_CLASS_OPTION, searcherClass);
+    conf.setInt(SEARCH_SIZE_OPTION, searchSize);
+    conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);
+    conf.set(DefaultOptionCreator.METHOD_OPTION, method);
+
+    conf.setBoolean(REDUCE_STREAMING_KMEANS, reduceStreamingKMeans);
+
+    log.info("Parameters are: [k] numClusters {}; "
+        + "[SKM] estimatedNumMapClusters {}; estimatedDistanceCutoff {} "
+        + "[BKM] maxNumIterations {}; trimFraction {}; randomInit {}; ignoreWeights {}; "
+        + "testProbability {}; numBallKMeansRuns {}; "
+        + "[S] measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; "
+        + "method {}; reduceStreamingKMeans {}", numClusters, estimatedNumMapClusters, estimatedDistanceCutoff,
+        maxNumIterations, trimFraction, randomInit, ignoreWeights, testProbability, numBallKMeansRuns,
+        measureClass, searcherClass, searchSize, numProjections, method, reduceStreamingKMeans);
+  }
+
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
+ * cluster the input vectors.
+ *
+ * @param conf the Configuration carrying the worker options previously set by configureOptionsForWorkers.
+ * @param input the directory pathname for input points.
+ * @param output the directory pathname for output points.
+ * @return 0 on success, -1 on failure.
+ */
+ public static int run(Configuration conf, Path input, Path output)
+ throws IOException, InterruptedException, ClassNotFoundException, ExecutionException {
+ log.info("Starting StreamingKMeans clustering for vectors in {}; results are output to {}",
+ input.toString(), output.toString());
+
+ // Dispatch on the method option: a local multi-threaded run, or a MapReduce job (the default).
+ if (conf.get(DefaultOptionCreator.METHOD_OPTION,
+ DefaultOptionCreator.MAPREDUCE_METHOD).equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
+ return runSequentially(conf, input, output);
+ } else {
+ return runMapReduce(conf, input, output);
+ }
+ }
+
+  /**
+   * Runs the clustering locally: one thread per input file performs the StreamingKMeans
+   * ("mapper") pass, the intermediate centroids are merged in memory, and a single BallKMeans
+   * ("reducer") pass writes the final centroids to part-r-00000 under the output path.
+   *
+   * @param conf the Configuration carrying the worker options.
+   * @param input the directory pathname for input points.
+   * @param output the directory pathname for the output sequence file.
+   * @return 0 on success.
+   */
+  private static int runSequentially(Configuration conf, Path input, Path output)
+      throws IOException, ExecutionException, InterruptedException {
+    long start = System.currentTimeMillis();
+    // Run StreamingKMeans step in parallel by spawning 1 thread per input path to process.
+    ExecutorService pool = Executors.newCachedThreadPool();
+    List<Future<Iterable<Centroid>>> intermediateCentroidFutures = new ArrayList<>();
+    for (FileStatus status : HadoopUtil.listStatus(FileSystem.get(conf), input, PathFilters.logsCRCFilter())) {
+      intermediateCentroidFutures.add(pool.submit(new StreamingKMeansThread(status.getPath(), conf)));
+    }
+    log.info("Finished running Mappers");
+    // Merge the resulting "mapper" centroids.
+    List<Centroid> intermediateCentroids = new ArrayList<>();
+    for (Future<Iterable<Centroid>> futureIterable : intermediateCentroidFutures) {
+      for (Centroid centroid : futureIterable.get()) {
+        intermediateCentroids.add(centroid);
+      }
+    }
+    pool.shutdown();
+    pool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
+    log.info("Finished StreamingKMeans");
+    SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+        new Path(output, "part-r-00000"), IntWritable.class, CentroidWritable.class);
+    int numCentroids = 0;
+    try {
+      // Run BallKMeans on the intermediate centroids.
+      for (Vector finalVector : StreamingKMeansReducer.getBestCentroids(intermediateCentroids, conf)) {
+        Centroid finalCentroid = (Centroid) finalVector;
+        writer.append(new IntWritable(numCentroids++), new CentroidWritable(finalCentroid));
+      }
+    } finally {
+      // Close the writer even if clustering or appending throws, so the stream is not leaked.
+      writer.close();
+    }
+    long end = System.currentTimeMillis();
+    log.info("Finished BallKMeans. Took {}.", (end - start) / 1000.0);
+    return 0;
+  }
+
+  /**
+   * Runs the clustering as a MapReduce job: StreamingKMeansMapper emits intermediate centroids
+   * and a single StreamingKMeansReducer collapses them into the final clusters.
+   *
+   * @param conf the Configuration carrying the worker options.
+   * @param input the directory pathname for input points.
+   * @param output the directory pathname for output points.
+   * @return 0 on success, -1 if the job failed.
+   */
+  public static int runMapReduce(Configuration conf, Path input, Path output)
+      throws IOException, ClassNotFoundException, InterruptedException {
+    // Prepare Job for submission.
+    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
+        StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
+        StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class, SequenceFileOutputFormat.class,
+        conf);
+    job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
+        StreamingKMeansMapper.class, StreamingKMeansReducer.class));
+
+    // A single reducer collects every intermediate centroid on one machine, where they are
+    // clustered in memory to yield exactly the requested number of clusters.
+    job.setNumReduceTasks(1);
+
+    // Ship the containing JAR so the required libraries are available on the cluster.
+    job.setJarByClass(StreamingKMeansDriver.class);
+
+    // Run job!
+    long start = System.currentTimeMillis();
+    boolean succeeded = job.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+    long end = System.currentTimeMillis();
+
+    log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms", output.toString(), end - start);
+    return 0;
+  }
+
+ /**
+ * Constructor to be used by the ToolRunner.
+ *
+ * <p>Private because instances are only created in {@code main}; all clustering entry points
+ * are static.
+ */
+ private StreamingKMeansDriver() {}
+
+ /**
+ * Command-line entry point; delegates argument handling to Hadoop's ToolRunner.
+ *
+ * @param args the command-line arguments, forwarded unchanged to the driver.
+ */
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new StreamingKMeansDriver(), args);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
new file mode 100644
index 0000000..f12a876
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansMapper.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+
+/**
+ * Mapper that clusters its input vectors online with StreamingKMeans and emits the resulting
+ * intermediate centroids, all under a single key so they reach one reducer.
+ */
+public class StreamingKMeansMapper extends Mapper<Writable, VectorWritable, IntWritable, CentroidWritable> {
+  // Number of points buffered to estimate the distance cutoff when none was configured.
+  private static final int NUM_ESTIMATE_POINTS = 1000;
+
+  /**
+   * The clusterer object used to cluster the points received by this mapper online.
+   */
+  private StreamingKMeans clusterer;
+
+  /**
+   * Number of points clustered so far.
+   */
+  private int numPoints = 0;
+
+  // True while points are still being buffered to estimate the distance cutoff from the data.
+  private boolean estimateDistanceCutoff = false;
+
+  // Buffer holding the first NUM_ESTIMATE_POINTS points; only used while estimating the cutoff.
+  private List<Centroid> estimatePoints;
+
+  @Override
+  public void setup(Context context) {
+    // At this point the configuration received from the Driver is assumed to be valid.
+    // No other checks are made.
+    Configuration conf = context.getConfiguration();
+    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
+    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
+    double estimatedDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
+        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
+    if (estimatedDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
+      estimateDistanceCutoff = true;
+      estimatePoints = new ArrayList<>();
+    }
+    // There is no way of estimating the distance cutoff unless we have some data.
+    clusterer = new StreamingKMeans(searcher, numClusters, estimatedDistanceCutoff);
+  }
+
+  // Estimates the distance cutoff from the buffered points, then clusters the whole buffer.
+  private void clusterEstimatePoints() {
+    clusterer.setDistanceCutoff(ClusteringUtils.estimateDistanceCutoff(
+        estimatePoints, clusterer.getDistanceMeasure()));
+    clusterer.cluster(estimatePoints);
+    estimateDistanceCutoff = false;
+  }
+
+  @Override
+  public void map(Writable key, VectorWritable point, Context context) {
+    Centroid centroid = new Centroid(numPoints++, point.get(), 1);
+    if (estimateDistanceCutoff) {
+      // Buffer the point first so that the point which triggers the estimate is also clustered.
+      // Previously the NUM_ESTIMATE_POINTS-th point was neither buffered nor clustered and was
+      // silently dropped.
+      estimatePoints.add(centroid);
+      if (numPoints >= NUM_ESTIMATE_POINTS) {
+        clusterEstimatePoints();
+      }
+    } else {
+      clusterer.cluster(centroid);
+    }
+  }
+
+  @Override
+  public void cleanup(Context context) throws IOException, InterruptedException {
+    // We should cluster the points at the end if they haven't yet been clustered
+    // (fewer than NUM_ESTIMATE_POINTS points were seen).
+    if (estimateDistanceCutoff) {
+      clusterEstimatePoints();
+    }
+    // Reindex the centroids before passing them to the reducer.
+    clusterer.reindexCentroids();
+    // All outputs have the same key to go to the same final reducer.
+    for (Centroid centroid : clusterer) {
+      context.write(new IntWritable(0), new CentroidWritable(centroid));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
new file mode 100644
index 0000000..2b78acc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansReducer.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.streaming.cluster.BallKMeans;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reducer that gathers all intermediate centroids (optionally collapsing them with another
+ * StreamingKMeans pass) and runs BallKMeans to produce the final centroids.
+ */
+public class StreamingKMeansReducer extends Reducer<IntWritable, CentroidWritable, IntWritable, CentroidWritable> {
+
+  private static final Logger log = LoggerFactory.getLogger(StreamingKMeansReducer.class);
+
+  /**
+   * Configuration for the MapReduce job.
+   */
+  private Configuration conf;
+
+  @Override
+  public void setup(Context context) {
+    // At this point the configuration received from the Driver is assumed to be valid.
+    // No other checks are made.
+    conf = context.getConfiguration();
+  }
+
+  /**
+   * Lazily adapts CentroidWritables to cloned Centroids. Cloning is required because Hadoop
+   * iterators mutate the contents of the Writable in place instead of allocating new references.
+   * Shared by {@link #reduce} and {@link #centroidWritablesToList} (previously duplicated).
+   */
+  private static Iterable<Centroid> toCentroids(Iterable<CentroidWritable> centroids) {
+    return Iterables.transform(centroids, new Function<CentroidWritable, Centroid>() {
+      @Override
+      public Centroid apply(CentroidWritable input) {
+        Preconditions.checkNotNull(input);
+        return input.getCentroid().clone();
+      }
+    });
+  }
+
+  @Override
+  public void reduce(IntWritable key, Iterable<CentroidWritable> centroids,
+                     Context context) throws IOException, InterruptedException {
+    List<Centroid> intermediateCentroids;
+    // There might be too many intermediate centroids to fit into memory, in which case, we run
+    // another pass of StreamingKMeans to collapse the clusters further.
+    if (conf.getBoolean(StreamingKMeansDriver.REDUCE_STREAMING_KMEANS, false)) {
+      intermediateCentroids = Lists.newArrayList(
+          new StreamingKMeansThread(toCentroids(centroids), conf).call());
+    } else {
+      intermediateCentroids = centroidWritablesToList(centroids);
+    }
+
+    int index = 0;
+    for (Vector centroid : getBestCentroids(intermediateCentroids, conf)) {
+      context.write(new IntWritable(index), new CentroidWritable((Centroid) centroid));
+      ++index;
+    }
+  }
+
+  /**
+   * Materializes the reducer's one-shot input Iterable into a list of independent Centroids.
+   */
+  public static List<Centroid> centroidWritablesToList(Iterable<CentroidWritable> centroids) {
+    // A new list must be created because Hadoop iterators mutate the contents of the Writable in
+    // place, without allocating new references when iterating through the centroids Iterable.
+    return Lists.newArrayList(toCentroids(centroids));
+  }
+
+  /**
+   * Runs BallKMeans over the intermediate centroids, configured from {@code conf}, and returns
+   * the best final centroids (selected on the held-out test split).
+   */
+  public static Iterable<Vector> getBestCentroids(List<Centroid> centroids, Configuration conf) {
+
+    if (log.isInfoEnabled()) {
+      log.info("Number of Centroids: {}", centroids.size());
+    }
+
+    int numClusters = conf.getInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, 1);
+    int maxNumIterations = conf.getInt(StreamingKMeansDriver.MAX_NUM_ITERATIONS, 10);
+    float trimFraction = conf.getFloat(StreamingKMeansDriver.TRIM_FRACTION, 0.9f);
+    boolean kMeansPlusPlusInit = !conf.getBoolean(StreamingKMeansDriver.RANDOM_INIT, false);
+    boolean correctWeights = !conf.getBoolean(StreamingKMeansDriver.IGNORE_WEIGHTS, false);
+    float testProbability = conf.getFloat(StreamingKMeansDriver.TEST_PROBABILITY, 0.1f);
+    int numRuns = conf.getInt(StreamingKMeansDriver.NUM_BALLKMEANS_RUNS, 3);
+
+    BallKMeans ballKMeansCluster = new BallKMeans(StreamingKMeansUtilsMR.searcherFromConfiguration(conf),
+        numClusters, maxNumIterations, trimFraction, kMeansPlusPlusInit, correctWeights, testProbability, numRuns);
+    return ballKMeansCluster.cluster(centroids);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
new file mode 100644
index 0000000..24cc1db
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansThread.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.cluster.StreamingKMeans;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Callable that runs one StreamingKMeans pass over a set of data points (one input file in the
+ * sequential driver, or the reducer's intermediate centroids) and returns the resulting clusters.
+ */
+public class StreamingKMeansThread implements Callable<Iterable<Centroid>> {
+  private static final Logger log = LoggerFactory.getLogger(StreamingKMeansThread.class);
+
+  // Number of points consumed up front to estimate the distance cutoff when none was configured.
+  private static final int NUM_ESTIMATE_POINTS = 1000;
+
+  private final Configuration conf;
+  private final Iterable<Centroid> dataPoints;
+
+  /**
+   * Creates a thread that reads its points from a sequence file of VectorWritables.
+   */
+  public StreamingKMeansThread(Path input, Configuration conf) {
+    this(StreamingKMeansUtilsMR.getCentroidsFromVectorWritable(
+        new SequenceFileValueIterable<VectorWritable>(input, false, conf)), conf);
+  }
+
+  /**
+   * Creates a thread that clusters the given points.
+   */
+  public StreamingKMeansThread(Iterable<Centroid> dataPoints, Configuration conf) {
+    this.dataPoints = dataPoints;
+    this.conf = conf;
+  }
+
+  @Override
+  public Iterable<Centroid> call() {
+    UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf);
+    int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1);
+    double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF,
+        StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF);
+
+    Iterator<Centroid> dataPointsIterator = dataPoints.iterator();
+
+    // If no cutoff was configured, buffer up to NUM_ESTIMATE_POINTS points and estimate it from
+    // the data. The buffer is kept so these points can be clustered below.
+    List<Centroid> estimatePoints = null;
+    if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) {
+      estimatePoints = new ArrayList<>(NUM_ESTIMATE_POINTS);
+      while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) {
+        estimatePoints.add(dataPointsIterator.next());
+      }
+
+      if (log.isInfoEnabled()) {
+        log.info("Estimated Points: {}", estimatePoints.size());
+      }
+      estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure());
+    }
+
+    StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff);
+
+    // Cluster the buffered estimate points first. Previously these points were only re-read when
+    // the iterator was fully exhausted, so with more than NUM_ESTIMATE_POINTS inputs the first
+    // NUM_ESTIMATE_POINTS points were silently dropped from the clustering.
+    if (estimatePoints != null) {
+      streamingKMeans.cluster(estimatePoints);
+    }
+
+    // Cluster the remaining points.
+    while (dataPointsIterator.hasNext()) {
+      streamingKMeans.cluster(dataPointsIterator.next());
+    }
+
+    streamingKMeans.reindexCentroids();
+    return streamingKMeans;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
new file mode 100644
index 0000000..f00cf56
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/StreamingKMeansUtilsMR.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import java.io.IOException;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.FastProjectionSearch;
+import org.apache.mahout.math.neighborhood.LocalitySensitiveHashSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+
+public final class StreamingKMeansUtilsMR {
+
+ private StreamingKMeansUtilsMR() {
+ }
+
+ /**
+ * Instantiates a searcher from a given configuration.
+ * @param conf the configuration
+ * @return the instantiated searcher
+ * @throws RuntimeException if the distance measure class cannot be instantiated
+ * @throws IllegalStateException if an unknown searcher class was requested
+ */
+ public static UpdatableSearcher searcherFromConfiguration(Configuration conf) {
+ DistanceMeasure distanceMeasure;
+ String distanceMeasureClass = conf.get(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ try {
+ distanceMeasure = (DistanceMeasure) Class.forName(distanceMeasureClass).getConstructor().newInstance();
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to instantiate distanceMeasure", e);
+ }
+
+ int numProjections = conf.getInt(StreamingKMeansDriver.NUM_PROJECTIONS_OPTION, 20);
+ int searchSize = conf.getInt(StreamingKMeansDriver.SEARCH_SIZE_OPTION, 10);
+
+ String searcherClass = conf.get(StreamingKMeansDriver.SEARCHER_CLASS_OPTION);
+
+ if (searcherClass.equals(BruteSearch.class.getName())) {
+ return ClassUtils.instantiateAs(searcherClass, UpdatableSearcher.class,
+ new Class[]{DistanceMeasure.class}, new Object[]{distanceMeasure});
+ } else if (searcherClass.equals(FastProjectionSearch.class.getName())
+ || searcherClass.equals(ProjectionSearch.class.getName())) {
+ return ClassUtils.instantiateAs(searcherClass, UpdatableSearcher.class,
+ new Class[]{DistanceMeasure.class, int.class, int.class},
+ new Object[]{distanceMeasure, numProjections, searchSize});
+ } else if (searcherClass.equals(LocalitySensitiveHashSearch.class.getName())) {
+ return ClassUtils.instantiateAs(searcherClass, LocalitySensitiveHashSearch.class,
+ new Class[]{DistanceMeasure.class, int.class},
+ new Object[]{distanceMeasure, searchSize});
+ } else {
+ throw new IllegalStateException("Unknown class instantiation requested");
+ }
+ }
+
+ /**
+ * Returns an Iterable of centroids from an Iterable of VectorWritables by creating a new Centroid containing
+ * a RandomAccessSparseVector as a delegate for each VectorWritable.
+ * @param inputIterable VectorWritable Iterable to get Centroids from
+ * @return the new Centroids
+ */
+ public static Iterable<Centroid> getCentroidsFromVectorWritable(Iterable<VectorWritable> inputIterable) {
+ return Iterables.transform(inputIterable, new Function<VectorWritable, Centroid>() {
+ private int numVectors = 0;
+ @Override
+ public Centroid apply(VectorWritable input) {
+ Preconditions.checkNotNull(input);
+ return new Centroid(numVectors++, new RandomAccessSparseVector(input.get()), 1);
+ }
+ });
+ }
+
+ /**
+ * Returns an Iterable of Centroid from an Iterable of Vector by either casting each Vector to Centroid (if the
+ * instance extends Centroid) or create a new Centroid based on that Vector.
+ * The implicit expectation is that the input will not have interleaving types of vectors. Otherwise, the numbering
+ * of new Centroids will become invalid.
+ * @param input Iterable of Vectors to cast
+ * @return the new Centroids
+ */
+ public static Iterable<Centroid> castVectorsToCentroids(Iterable<Vector> input) {
+ return Iterables.transform(input, new Function<Vector, Centroid>() {
+ private int numVectors = 0;
+ @Override
+ public Centroid apply(Vector input) {
+ Preconditions.checkNotNull(input);
+ if (input instanceof Centroid) {
+ return (Centroid) input;
+ } else {
+ return new Centroid(numVectors++, input, 1);
+ }
+ }
+ });
+ }
+
+ /**
+ * Writes centroids to a sequence file.
+ * @param centroids the centroids to write.
+ * @param path the path of the output file.
+ * @param conf the configuration for the HDFS to write the file to.
+ * @throws java.io.IOException
+ */
+ public static void writeCentroidsToSequenceFile(Iterable<Centroid> centroids, Path path, Configuration conf)
+ throws IOException {
+ try (SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+ path, IntWritable.class, CentroidWritable.class)) {
+ int i = 0;
+ for (Centroid centroid : centroids) {
+ writer.append(new IntWritable(i++), new CentroidWritable(centroid));
+ }
+ }
+ }
+
+ public static void writeVectorsToSequenceFile(Iterable<? extends Vector> datapoints, Path path, Configuration conf)
+ throws IOException {
+ try (SequenceFile.Writer writer = SequenceFile.createWriter(FileSystem.get(conf), conf,
+ path, IntWritable.class, VectorWritable.class)){
+ int i = 0;
+ for (Vector vector : datapoints) {
+ writer.append(new IntWritable(i++), new VectorWritable(vector));
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
new file mode 100644
index 0000000..d7ca554
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/tools/ResplitSequenceFiles.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import com.google.common.collect.Iterables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+
+public class ResplitSequenceFiles {
+
+ private String inputFile;
+ private String outputFileBase;
+ private int numSplits;
+
+ private Configuration conf;
+ private FileSystem fs;
+
+ private ResplitSequenceFiles() {}
+
+ private void writeSplit(Iterator<Pair<Writable, Writable>> inputIterator,
+ int numSplit, int numEntriesPerSplit) throws IOException {
+ SequenceFile.Writer splitWriter = null;
+ for (int j = 0; j < numEntriesPerSplit; ++j) {
+ Pair<Writable, Writable> item = inputIterator.next();
+ if (splitWriter == null) {
+ splitWriter = SequenceFile.createWriter(fs, conf,
+ new Path(outputFileBase + "-" + numSplit), item.getFirst().getClass(), item.getSecond().getClass());
+ }
+ splitWriter.append(item.getFirst(), item.getSecond());
+ }
+ if (splitWriter != null) {
+ splitWriter.close();
+ }
+ }
+
+ private void run(PrintWriter printWriter) throws IOException {
+ conf = new Configuration();
+ SequenceFileDirIterable<Writable, Writable> inputIterable = new
+ SequenceFileDirIterable<>(new Path(inputFile), PathType.LIST, conf);
+ fs = FileSystem.get(conf);
+
+ int numEntries = Iterables.size(inputIterable);
+ int numEntriesPerSplit = numEntries / numSplits;
+ int numEntriesLastSplit = numEntriesPerSplit + numEntries - numEntriesPerSplit * numSplits;
+ Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();
+
+ printWriter.printf("Writing %d splits\n", numSplits);
+ for (int i = 0; i < numSplits - 1; ++i) {
+ printWriter.printf("Writing split %d\n", i);
+ writeSplit(inputIterator, i, numEntriesPerSplit);
+ }
+ printWriter.printf("Writing split %d\n", numSplits - 1);
+ writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
+ }
+
+ private boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder.withLongName("input")
+ .withShortName("i")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("what the base folder for sequence files is (they all must have the same key/value type")
+ .create();
+
+ Option outputFileOption = builder.withLongName("output")
+ .withShortName("o")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+ .withDescription("the base name of the file split that the files will be split it; the i'th split has the "
+ + "suffix -i")
+ .create();
+
+ Option numSplitsOption = builder.withLongName("numSplits")
+ .withShortName("ns")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("numSplits").withMaximum(1).create())
+ .withDescription("how many splits to use for the given files")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(inputFileOption)
+ .withOption(outputFileOption)
+ .withOption(numSplitsOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ inputFile = (String) cmdLine.getValue(inputFileOption);
+ outputFileBase = (String) cmdLine.getValue(outputFileOption);
+ numSplits = Integer.parseInt((String) cmdLine.getValue(numSplitsOption));
+ return true;
+ }
+
+ public static void main(String[] args) throws IOException {
+ ResplitSequenceFiles runner = new ResplitSequenceFiles();
+ if (runner.parseArgs(args)) {
+ runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
new file mode 100644
index 0000000..11bc34a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/PathDirectory.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown;
+
+import java.io.File;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Contains list of all internal paths used in top down clustering.
+ */
+public final class PathDirectory {
+
+ public static final String TOP_LEVEL_CLUSTER_DIRECTORY = "topLevelCluster";
+ public static final String POST_PROCESS_DIRECTORY = "clusterPostProcessed";
+ public static final String CLUSTERED_POINTS_DIRECTORY = "clusteredPoints";
+ public static final String BOTTOM_LEVEL_CLUSTER_DIRECTORY = "bottomLevelCluster";
+
+ private PathDirectory() {
+ }
+
+ /**
+ * All output of top level clustering is stored in output directory/topLevelCluster.
+ *
+ * @param output
+ * the output path of clustering.
+ * @return The top level Cluster Directory.
+ */
+ public static Path getTopLevelClusterPath(Path output) {
+ return new Path(output + File.separator + TOP_LEVEL_CLUSTER_DIRECTORY);
+ }
+
+ /**
+ * The output of top level clusters is post processed and kept in this path.
+ *
+ * @param outputPathProvidedByUser
+ * the output path of clustering.
+ * @return the path where the output of top level cluster post processor is kept.
+ */
+ public static Path getClusterPostProcessorOutputDirectory(Path outputPathProvidedByUser) {
+ return new Path(outputPathProvidedByUser + File.separator + POST_PROCESS_DIRECTORY);
+ }
+
+ /**
+ * The top level clustered points before post processing is generated here.
+ *
+ * @param output
+ * the output path of clustering.
+ * @return the clustered points directory
+ */
+ public static Path getClusterOutputClusteredPoints(Path output) {
+ return new Path(output + File.separator + CLUSTERED_POINTS_DIRECTORY + File.separator, "*");
+ }
+
+ /**
+ * Each cluster produced by top level clustering is processed in output/"bottomLevelCluster"/clusterId.
+ *
+ * @param output
+ * @param clusterId
+ * @return the bottom level clustering path.
+ */
+ public static Path getBottomLevelClusterPath(Path output, String clusterId) {
+ return new Path(output + File.separator + BOTTOM_LEVEL_CLUSTER_DIRECTORY + File.separator + clusterId);
+ }
+
+ /**
+ * Each clusters path name is its clusterId. The vectors reside in separate files inside it.
+ *
+ * @param clusterPostProcessorOutput
+ * the path of cluster post processor output.
+ * @param clusterId
+ * the id of the cluster.
+ * @return the cluster path for cluster id.
+ */
+ public static Path getClusterPathForClusterId(Path clusterPostProcessorOutput, String clusterId) {
+ return new Path(clusterPostProcessorOutput + File.separator + clusterId);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
new file mode 100644
index 0000000..d0563fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterCountReader.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Reads the number of clusters produced by the clustering algorithm.
+ */
+public final class ClusterCountReader {
+
+ private ClusterCountReader() {
+ }
+
+ /**
+ * Reads the number of clusters present by reading the clusters-*-final file.
+ *
+ * @param clusterOutputPath The output path provided to the clustering algorithm.
+ * @param conf The hadoop configuration.
+ * @return the number of final clusters.
+ */
+ public static int getNumberOfClusters(Path clusterOutputPath, Configuration conf) throws IOException {
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ int numberOfClusters = 0;
+ Iterator<?> it = new SequenceFileDirValueIterator<>(clusterFiles[0].getPath(),
+ PathType.LIST,
+ PathFilters.partFilter(),
+ null,
+ true,
+ conf);
+ while (it.hasNext()) {
+ it.next();
+ numberOfClusters++;
+ }
+ return numberOfClusters;
+ }
+
+ /**
+ * Generates a list of all cluster ids by reading the clusters-*-final file.
+ *
+ * @param clusterOutputPath The output path provided to the clustering algorithm.
+ * @param conf The hadoop configuration.
+ * @return An ArrayList containing the final cluster ids.
+ */
+ public static Map<Integer, Integer> getClusterIDs(Path clusterOutputPath, Configuration conf, boolean keyIsClusterId)
+ throws IOException {
+ Map<Integer, Integer> clusterIds = new HashMap<>();
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ //System.out.println("LOOK HERE: " + clusterOutputPath);
+ Iterator<ClusterWritable> it = new SequenceFileDirValueIterator<>(clusterFiles[0].getPath(),
+ PathType.LIST,
+ PathFilters.partFilter(),
+ null,
+ true,
+ conf);
+ int i = 0;
+ while (it.hasNext()) {
+ Integer key;
+ Integer value;
+ if (keyIsClusterId) { // key is the cluster id, value is i, the index we will use
+ key = it.next().getValue().getId();
+ value = i;
+ } else {
+ key = i;
+ value = it.next().getValue().getId();
+ }
+ clusterIds.put(key, value);
+ i++;
+ }
+ return clusterIds;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
new file mode 100644
index 0000000..ded76ad
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessor.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.IOUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * This class reads the output of any clustering algorithm, and, creates separate directories for different
+ * clusters. Each cluster directory's name is its clusterId. Each and every point is written in the cluster
+ * directory associated with that point.
+ * <p/>
+ * This class incorporates a sequential algorithm and is appropriate for use for data which has been clustered
+ * sequentially.
+ * <p/>
+ * The sequential and non sequential version, both are being used from {@link ClusterOutputPostProcessorDriver}.
+ */
/**
 * This class reads the output of any clustering algorithm, and, creates separate directories for different
 * clusters. Each cluster directory's name is its clusterId. Each and every point is written in the cluster
 * directory associated with that point.
 * <p/>
 * This class incorporates a sequential algorithm and is appropriate for use for data which has been clustered
 * sequentially.
 * <p/>
 * The sequential and non sequential version, both are being used from {@link ClusterOutputPostProcessorDriver}.
 */
public final class ClusterOutputPostProcessor {

  // Glob path ("<clusterOutput>/clusteredPoints/*") the clustered points are read from.
  private Path clusteredPoints;
  private final FileSystem fileSystem;
  private final Configuration conf;
  // Root directory under which one subdirectory per cluster id is created.
  private final Path clusterPostProcessorOutput;
  // Cluster id -> path of its post processed directory; populated as points are written.
  private final Map<String, Path> postProcessedClusterDirectories = new HashMap<>();
  // Monotonically increasing key for the vectors written, shared across all clusters.
  private long uniqueVectorId = 0L;
  // One lazily created writer per cluster id; all closed together at the end of process().
  private final Map<String, SequenceFile.Writer> writersForClusters;

  /**
   * @param clusterOutputToBeProcessed the output directory of the clustering run to post process.
   * @param output                     the directory the post processed clusters are written under.
   * @param hadoopConfiguration        the configuration used to access the file system.
   */
  public ClusterOutputPostProcessor(Path clusterOutputToBeProcessed,
                                    Path output,
                                    Configuration hadoopConfiguration) throws IOException {
    this.clusterPostProcessorOutput = output;
    this.clusteredPoints = PathDirectory.getClusterOutputClusteredPoints(clusterOutputToBeProcessed);
    this.conf = hadoopConfiguration;
    this.writersForClusters = new HashMap<>();
    fileSystem = clusteredPoints.getFileSystem(conf);
  }

  /**
   * This method takes the clustered points output by the clustering algorithms as input and writes them into
   * their respective clusters.
   */
  public void process() throws IOException {
    createPostProcessDirectory();
    // Each record's key is the cluster id; its value is the weighted point assigned to that cluster.
    for (Pair<?, WeightedVectorWritable> record
        : new SequenceFileDirIterable<Writable, WeightedVectorWritable>(clusteredPoints, PathType.GLOB, PathFilters.partFilter(),
                                                                        null, false, conf)) {
      String clusterId = record.getFirst().toString().trim();
      putVectorInRespectiveCluster(clusterId, record.getSecond());
    }
    // Close every per-cluster writer in one pass, then forget them so process() could run again.
    IOUtils.close(writersForClusters.values());
    writersForClusters.clear();
  }

  /**
   * Creates the directory to put post processed clusters.
   */
  private void createPostProcessDirectory() throws IOException {
    // mkdirs is only attempted when the directory does not already exist.
    if (!fileSystem.exists(clusterPostProcessorOutput)
        && !fileSystem.mkdirs(clusterPostProcessorOutput)) {
      throw new IOException("Error creating cluster post processor directory");
    }
  }

  /**
   * Finds out the cluster directory of the vector and writes it into the specified cluster.
   */
  private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point) throws IOException {
    Writer writer = findWriterForVector(clusterId);
    postProcessedClusterDirectories.put(clusterId,
        PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId));
    writeVectorToCluster(writer, point);
  }

  /**
   * Finds out the path in cluster where the point is supposed to be written.
   */
  private Writer findWriterForVector(String clusterId) throws IOException {
    Path clusterDirectory = PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId);
    Writer writer = writersForClusters.get(clusterId);
    // Lazily open one "part-m-0" writer per cluster the first time the cluster is seen.
    if (writer == null) {
      Path pathToWrite = new Path(clusterDirectory, new Path("part-m-0"));
      writer = new Writer(fileSystem, conf, pathToWrite, LongWritable.class, VectorWritable.class);
      writersForClusters.put(clusterId, writer);
    }
    return writer;
  }

  /**
   * Writes vector to the cluster directory.
   */
  private void writeVectorToCluster(Writer writer, WeightedVectorWritable point) throws IOException {
    // Keys are globally unique across clusters; sync() creates a sync point after each record.
    writer.append(new LongWritable(uniqueVectorId++), new VectorWritable(point.getVector()));
    writer.sync();
  }

  /**
   * @return the set of all post processed cluster paths.
   */
  public Map<String, Path> getPostProcessedClusterDirectories() {
    return postProcessedClusterDirectories;
  }

  public void setClusteredPoints(Path clusteredPoints) {
    this.clusteredPoints = clusteredPoints;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
new file mode 100644
index 0000000..82a3071
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorDriver.java
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+/**
+ * Post processes the output of clustering algorithms and groups them into respective clusters. Ideal to be
+ * used for top down clustering. It can also be used if the clustering output needs to be grouped into their
+ * respective clusters.
+ */
+public final class ClusterOutputPostProcessorDriver extends AbstractJob {
+
+ /**
+ * CLI to run clustering post processor. The input to post processor is the ouput path specified to the
+ * clustering.
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+ Path input = getInputPath();
+ Path output = getOutputPath();
+
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+ DefaultOptionCreator.SEQUENTIAL_METHOD);
+ run(input, output, runSequential);
+ return 0;
+
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new ClusterOutputPostProcessorDriver(), args);
+ }
+
+ /**
+ * Post processes the output of clustering algorithms and groups them into respective clusters. Each
+ * cluster's vectors are written into a directory named after its clusterId.
+ *
+ * @param input The output path provided to the clustering algorithm, whose would be post processed. Hint: The
+ * path of the directory containing clusters-*-final and clusteredPoints.
+ * @param output The post processed data would be stored at this path.
+ * @param runSequential If set to true, post processes it sequentially, else, uses. MapReduce. Hint: If the clustering
+ * was done sequentially, make it sequential, else vice versa.
+ */
+ public static void run(Path input, Path output, boolean runSequential) throws IOException,
+ InterruptedException,
+ ClassNotFoundException {
+ if (runSequential) {
+ postProcessSeq(input, output);
+ } else {
+ Configuration conf = new Configuration();
+ postProcessMR(conf, input, output);
+ movePartFilesToRespectiveDirectories(conf, output);
+ }
+
+ }
+
+ /**
+ * Process Sequentially. Reads the vectors one by one, and puts them into respective directory, named after
+ * their clusterId.
+ *
+ * @param input The output path provided to the clustering algorithm, whose would be post processed. Hint : The
+ * path of the directory containing clusters-*-final and clusteredPoints.
+ * @param output The post processed data would be stored at this path.
+ */
+ private static void postProcessSeq(Path input, Path output) throws IOException {
+ ClusterOutputPostProcessor clusterOutputPostProcessor = new ClusterOutputPostProcessor(input, output,
+ new Configuration());
+ clusterOutputPostProcessor.process();
+ }
+
+ /**
+ * Process as a map reduce job. The numberOfReduceTasks is set to the number of clusters present in the
+ * output. So that each cluster's vector is written in its own part file.
+ *
+ * @param conf The hadoop configuration.
+ * @param input The output path provided to the clustering algorithm, whose would be post processed. Hint : The
+ * path of the directory containing clusters-*-final and clusteredPoints.
+ * @param output The post processed data would be stored at this path.
+ */
+ private static void postProcessMR(Configuration conf, Path input, Path output) throws IOException,
+ InterruptedException,
+ ClassNotFoundException {
+ System.out.println("WARNING: If you are running in Hadoop local mode, please use the --sequential option, "
+ + "as the MapReduce option will not work properly");
+ int numberOfClusters = ClusterCountReader.getNumberOfClusters(input, conf);
+ conf.set("clusterOutputPath", input.toString());
+ Job job = new Job(conf, "ClusterOutputPostProcessor Driver running over input: " + input);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(ClusterOutputPostProcessorMapper.class);
+ job.setMapOutputKeyClass(IntWritable.class);
+ job.setMapOutputValueClass(VectorWritable.class);
+ job.setReducerClass(ClusterOutputPostProcessorReducer.class);
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setNumReduceTasks(numberOfClusters);
+ job.setJarByClass(ClusterOutputPostProcessorDriver.class);
+
+ FileInputFormat.addInputPath(job, new Path(input, new Path("clusteredPoints")));
+ FileOutputFormat.setOutputPath(job, output);
+ if (!job.waitForCompletion(true)) {
+ throw new InterruptedException("ClusterOutputPostProcessor Job failed processing " + input);
+ }
+ }
+
+ /**
+ * The mapreduce version of the post processor writes different clusters into different part files. This
+ * method reads the part files and moves them into directories named after their clusterIds.
+ *
+ * @param conf The hadoop configuration.
+ * @param output The post processed data would be stored at this path.
+ */
+ private static void movePartFilesToRespectiveDirectories(Configuration conf, Path output) throws IOException {
+ FileSystem fileSystem = output.getFileSystem(conf);
+ for (FileStatus fileStatus : fileSystem.listStatus(output, PathFilters.partFilter())) {
+ SequenceFileIterator<Writable, Writable> it =
+ new SequenceFileIterator<>(fileStatus.getPath(), true, conf);
+ if (it.hasNext()) {
+ renameFile(it.next().getFirst(), fileStatus, conf);
+ }
+ it.close();
+ }
+ }
+
+ /**
+ * Using @FileSystem rename method to move the file.
+ */
+ private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException {
+ Path path = fileStatus.getPath();
+ FileSystem fileSystem = path.getFileSystem(conf);
+ Path subDir = new Path(key.toString());
+ Path renameTo = new Path(path.getParent(), subDir);
+ fileSystem.mkdirs(renameTo);
+ fileSystem.rename(path, renameTo);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
new file mode 100644
index 0000000..6834362
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorMapper.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Mapper for post processing cluster output.
+ */
+public class ClusterOutputPostProcessorMapper extends
+ Mapper<IntWritable, WeightedVectorWritable, IntWritable, VectorWritable> {
+
+ private Map<Integer, Integer> newClusterMappings;
+ private VectorWritable outputVector;
+
+ //read the current cluster ids, and populate the cluster mapping hash table
+ @Override
+ public void setup(Context context) throws IOException {
+ Configuration conf = context.getConfiguration();
+ //this give the clusters-x-final directory where the cluster ids can be read
+ Path clusterOutputPath = new Path(conf.get("clusterOutputPath"));
+ //we want the key to be the cluster id, the value to be the index
+ newClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, true);
+ outputVector = new VectorWritable();
+ }
+
+ @Override
+ public void map(IntWritable key, WeightedVectorWritable val, Context context)
+ throws IOException, InterruptedException {
+ // by pivoting on the cluster mapping value, we can make sure that each unique cluster goes to it's own reducer,
+ // since they are numbered from 0 to k-1, where k is the number of clusters
+ outputVector.set(val.getVector());
+ context.write(new IntWritable(newClusterMappings.get(key.get())), outputVector);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
new file mode 100644
index 0000000..58dada4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/topdown/postprocessor/ClusterOutputPostProcessorReducer.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.topdown.postprocessor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Map;
+
+/**
+ * Reducer for post processing cluster output.
+ */
+public class ClusterOutputPostProcessorReducer
+ extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+ private Map<Integer, Integer> reverseClusterMappings;
+
+ //read the current cluster ids, and populate the hash cluster mapping hash table
+ @Override
+ public void setup(Context context) throws IOException {
+ Configuration conf = context.getConfiguration();
+ Path clusterOutputPath = new Path(conf.get("clusterOutputPath"));
+ //we want to the key to be the index, the value to be the cluster id
+ reverseClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, false);
+ }
+
+ /**
+ * The key is the remapped cluster id and the values contains the vectors in that cluster.
+ */
+ @Override
+ protected void reduce(IntWritable key, Iterable<VectorWritable> values, Context context) throws IOException,
+ InterruptedException {
+ //remap the cluster back to its original id
+ //and then output the vectors with their correct
+ //cluster id.
+ IntWritable outKey = new IntWritable(reverseClusterMappings.get(key.get()));
+ System.out.println(outKey + " this: " + this);
+ for (VectorWritable value : values) {
+ context.write(outKey, value);
+ }
+ }
+
+}
r***@apache.org
2018-06-28 14:54:30 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
new file mode 100644
index 0000000..0e7ee96
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/commandline/DefaultOptionCreator.java
@@ -0,0 +1,417 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.commandline;
+
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.clustering.kernel.TriangularKernelProfile;
+
+
+public final class DefaultOptionCreator {
+
+ public static final String CLUSTERING_OPTION = "clustering";
+
+ public static final String CLUSTERS_IN_OPTION = "clusters";
+
+ public static final String CONVERGENCE_DELTA_OPTION = "convergenceDelta";
+
+ public static final String DISTANCE_MEASURE_OPTION = "distanceMeasure";
+
+ public static final String EMIT_MOST_LIKELY_OPTION = "emitMostLikely";
+
+ public static final String INPUT_OPTION = "input";
+
+ public static final String MAX_ITERATIONS_OPTION = "maxIter";
+
+ public static final String MAX_REDUCERS_OPTION = "maxRed";
+
+ public static final String METHOD_OPTION = "method";
+
+ public static final String NUM_CLUSTERS_OPTION = "numClusters";
+
+ public static final String OUTPUT_OPTION = "output";
+
+ public static final String OVERWRITE_OPTION = "overwrite";
+
+ public static final String T1_OPTION = "t1";
+
+ public static final String T2_OPTION = "t2";
+
+ public static final String T3_OPTION = "t3";
+
+ public static final String T4_OPTION = "t4";
+
+ public static final String OUTLIER_THRESHOLD = "outlierThreshold";
+
+ public static final String CLUSTER_FILTER_OPTION = "clusterFilter";
+
+ public static final String THRESHOLD_OPTION = "threshold";
+
+ public static final String SEQUENTIAL_METHOD = "sequential";
+
+ public static final String MAPREDUCE_METHOD = "mapreduce";
+
+ public static final String KERNEL_PROFILE_OPTION = "kernelProfile";
+
+ public static final String ANALYZER_NAME_OPTION = "analyzerName";
+
+ public static final String RANDOM_SEED = "randomSeed";
+
+ private DefaultOptionCreator() {}
+
+ /**
+ * Returns a default command line option for help. Used by all clustering jobs
+ * and many others
+ * */
+ public static Option helpOption() {
+ return new DefaultOptionBuilder().withLongName("help")
+ .withDescription("Print out help").withShortName("h").create();
+ }
+
+ /**
+ * Returns a default command line option for input directory specification.
+ * Used by all clustering jobs plus others
+ */
+ public static DefaultOptionBuilder inputOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(INPUT_OPTION)
+ .withRequired(false)
+ .withShortName("i")
+ .withArgument(
+ new ArgumentBuilder().withName(INPUT_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription("Path to job input directory.");
+ }
+
+ /**
+ * Returns a default command line option for clusters input directory
+ * specification. Used by FuzzyKmeans, Kmeans
+ */
+ public static DefaultOptionBuilder clustersInOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(CLUSTERS_IN_OPTION)
+ .withRequired(true)
+ .withArgument(
+ new ArgumentBuilder().withName(CLUSTERS_IN_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription(
+ "The path to the initial clusters directory. Must be a SequenceFile of some type of Cluster")
+ .withShortName("c");
+ }
+
+ /**
+ * Returns a default command line option for output directory specification.
+ * Used by all clustering jobs plus others
+ */
+ public static DefaultOptionBuilder outputOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(OUTPUT_OPTION)
+ .withRequired(false)
+ .withShortName("o")
+ .withArgument(
+ new ArgumentBuilder().withName(OUTPUT_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription("The directory pathname for output.");
+ }
+
+ /**
+ * Returns a default command line option for output directory overwriting.
+ * Used by all clustering jobs
+ */
+ public static DefaultOptionBuilder overwriteOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(OVERWRITE_OPTION)
+ .withRequired(false)
+ .withDescription(
+ "If present, overwrite the output directory before running job")
+ .withShortName("ow");
+ }
+
+ /**
+ * Returns a default command line option for specification of distance measure
+ * class to use. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift
+ */
+ public static DefaultOptionBuilder distanceMeasureOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(DISTANCE_MEASURE_OPTION)
+ .withRequired(false)
+ .withShortName("dm")
+ .withArgument(
+ new ArgumentBuilder().withName(DISTANCE_MEASURE_OPTION)
+ .withDefault(SquaredEuclideanDistanceMeasure.class.getName())
+ .withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The classname of the DistanceMeasure. Default is SquaredEuclidean");
+ }
+
+ /**
+ * Returns a default command line option for specification of sequential or
+ * parallel operation. Used by Canopy, FuzzyKmeans, Kmeans, MeanShift,
+ * Dirichlet
+ */
+ public static DefaultOptionBuilder methodOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(METHOD_OPTION)
+ .withRequired(false)
+ .withShortName("xm")
+ .withArgument(
+ new ArgumentBuilder().withName(METHOD_OPTION)
+ .withDefault(MAPREDUCE_METHOD).withMinimum(1).withMaximum(1)
+ .create())
+ .withDescription(
+ "The execution method to use: sequential or mapreduce. Default is mapreduce");
+ }
+
+ /**
+ * Returns a default command line option for specification of T1. Used by
+ * Canopy, MeanShift
+ */
+ public static DefaultOptionBuilder t1Option() {
+ return new DefaultOptionBuilder()
+ .withLongName(T1_OPTION)
+ .withRequired(true)
+ .withArgument(
+ new ArgumentBuilder().withName(T1_OPTION).withMinimum(1)
+ .withMaximum(1).create()).withDescription("T1 threshold value")
+ .withShortName(T1_OPTION);
+ }
+
+ /**
+ * Returns a default command line option for specification of T2. Used by
+ * Canopy, MeanShift
+ */
+ public static DefaultOptionBuilder t2Option() {
+ return new DefaultOptionBuilder()
+ .withLongName(T2_OPTION)
+ .withRequired(true)
+ .withArgument(
+ new ArgumentBuilder().withName(T2_OPTION).withMinimum(1)
+ .withMaximum(1).create()).withDescription("T2 threshold value")
+ .withShortName(T2_OPTION);
+ }
+
+ /**
+ * Returns a default command line option for specification of T3 (Reducer T1).
+ * Used by Canopy
+ */
+ public static DefaultOptionBuilder t3Option() {
+ return new DefaultOptionBuilder()
+ .withLongName(T3_OPTION)
+ .withRequired(false)
+ .withArgument(
+ new ArgumentBuilder().withName(T3_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription("T3 (Reducer T1) threshold value")
+ .withShortName(T3_OPTION);
+ }
+
+ /**
+ * Returns a default command line option for specification of T4 (Reducer T2).
+ * Used by Canopy
+ */
+ public static DefaultOptionBuilder t4Option() {
+ return new DefaultOptionBuilder()
+ .withLongName(T4_OPTION)
+ .withRequired(false)
+ .withArgument(
+ new ArgumentBuilder().withName(T4_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription("T4 (Reducer T2) threshold value")
+ .withShortName(T4_OPTION);
+ }
+
+ /**
+ * @return a DefaultOptionBuilder for the clusterFilter option
+ */
+ public static DefaultOptionBuilder clusterFilterOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(CLUSTER_FILTER_OPTION)
+ .withShortName("cf")
+ .withRequired(false)
+ .withArgument(
+ new ArgumentBuilder().withName(CLUSTER_FILTER_OPTION).withMinimum(1)
+ .withMaximum(1).create())
+ .withDescription("Cluster filter suppresses small canopies from mapper")
+ .withShortName(CLUSTER_FILTER_OPTION);
+ }
+
+ /**
+ * Returns a default command line option for specification of max number of
+ * iterations. Used by Dirichlet, FuzzyKmeans, Kmeans, LDA
+ */
+ public static DefaultOptionBuilder maxIterationsOption() {
+ // default value used by LDA which overrides withRequired(false)
+ return new DefaultOptionBuilder()
+ .withLongName(MAX_ITERATIONS_OPTION)
+ .withRequired(true)
+ .withShortName("x")
+ .withArgument(
+ new ArgumentBuilder().withName(MAX_ITERATIONS_OPTION)
+ .withDefault("-1").withMinimum(1).withMaximum(1).create())
+ .withDescription("The maximum number of iterations.");
+ }
+
+ /**
+ * Returns a default command line option for specification of numbers of
+ * clusters to create. Used by Dirichlet, FuzzyKmeans, Kmeans
+ */
+ public static DefaultOptionBuilder numClustersOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(NUM_CLUSTERS_OPTION)
+ .withRequired(false)
+ .withArgument(
+ new ArgumentBuilder().withName("k").withMinimum(1).withMaximum(1)
+ .create()).withDescription("The number of clusters to create")
+ .withShortName("k");
+ }
+
+ public static DefaultOptionBuilder useSetRandomSeedOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(RANDOM_SEED)
+ .withRequired(false)
+ .withArgument(new ArgumentBuilder().withName(RANDOM_SEED).create())
+ .withDescription("Seed to initaize Random Number Generator with")
+ .withShortName("rs");
+ }
+
+ /**
+ * Returns a default command line option for convergence delta specification.
+ * Used by FuzzyKmeans, Kmeans, MeanShift
+ */
+ public static DefaultOptionBuilder convergenceOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(CONVERGENCE_DELTA_OPTION)
+ .withRequired(false)
+ .withShortName("cd")
+ .withArgument(
+ new ArgumentBuilder().withName(CONVERGENCE_DELTA_OPTION)
+ .withDefault("0.5").withMinimum(1).withMaximum(1).create())
+ .withDescription("The convergence delta value. Default is 0.5");
+ }
+
+ /**
+ * Returns a default command line option for specifying the max number of
+ * reducers. Used by Dirichlet, FuzzyKmeans, Kmeans and LDA
+ *
+ * @deprecated
+ */
+ @Deprecated
+ public static DefaultOptionBuilder numReducersOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(MAX_REDUCERS_OPTION)
+ .withRequired(false)
+ .withShortName("r")
+ .withArgument(
+ new ArgumentBuilder().withName(MAX_REDUCERS_OPTION)
+ .withDefault("2").withMinimum(1).withMaximum(1).create())
+ .withDescription("The number of reduce tasks. Defaults to 2");
+ }
+
+ /**
+ * Returns a default command line option for clustering specification. Used by
+ * all clustering except LDA
+ */
+ public static DefaultOptionBuilder clusteringOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(CLUSTERING_OPTION)
+ .withRequired(false)
+ .withDescription(
+ "If present, run clustering after the iterations have taken place")
+ .withShortName("cl");
+ }
+
+ /**
+ * Returns a default command line option for specifying a Lucene analyzer class
+ * @return {@link DefaultOptionBuilder}
+ */
+ public static DefaultOptionBuilder analyzerOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(ANALYZER_NAME_OPTION)
+ .withRequired(false)
+ .withDescription("If present, the name of a Lucene analyzer class to use")
+ .withArgument(new ArgumentBuilder().withName(ANALYZER_NAME_OPTION).withDefault(StandardAnalyzer.class.getName())
+ .withMinimum(1).withMaximum(1).create())
+ .withShortName("an");
+ }
+
+
+ /**
+ * Returns a default command line option for specifying the emitMostLikely
+ * flag. Used by Dirichlet and FuzzyKmeans
+ */
+ public static DefaultOptionBuilder emitMostLikelyOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(EMIT_MOST_LIKELY_OPTION)
+ .withRequired(false)
+ .withShortName("e")
+ .withArgument(
+ new ArgumentBuilder().withName(EMIT_MOST_LIKELY_OPTION)
+ .withDefault("true").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "True if clustering should emit the most likely point only, "
+ + "false for threshold clustering. Default is true");
+ }
+
+ /**
+ * Returns a default command line option for specifying the clustering
+ * threshold value. Used by Dirichlet and FuzzyKmeans
+ */
+ public static DefaultOptionBuilder thresholdOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(THRESHOLD_OPTION)
+ .withRequired(false)
+ .withShortName("t")
+ .withArgument(
+ new ArgumentBuilder().withName(THRESHOLD_OPTION).withDefault("0")
+ .withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The pdf threshold used for cluster determination. Default is 0");
+ }
+
+ public static DefaultOptionBuilder kernelProfileOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(KERNEL_PROFILE_OPTION)
+ .withRequired(false)
+ .withShortName("kp")
+ .withArgument(
+ new ArgumentBuilder()
+ .withName(KERNEL_PROFILE_OPTION)
+ .withDefault(TriangularKernelProfile.class.getName())
+ .withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The classname of the IKernelProfile. Default is TriangularKernelProfile");
+ }
+
+ /**
+ * Returns a default command line option for specification of OUTLIER THRESHOLD value. Used for
+ * Cluster Classification.
+ */
+ public static DefaultOptionBuilder outlierThresholdOption() {
+ return new DefaultOptionBuilder()
+ .withLongName(OUTLIER_THRESHOLD)
+ .withRequired(false)
+ .withArgument(
+ new ArgumentBuilder().withName(OUTLIER_THRESHOLD).withMinimum(1)
+ .withMaximum(1).create()).withDescription("Outlier threshold value")
+ .withShortName(OUTLIER_THRESHOLD);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
new file mode 100644
index 0000000..61aa9a5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ChebyshevDistanceMeasure.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
/**
 * This class implements a "Chebyshev distance" metric by finding the maximum difference
 * between each coordinate. Also 'chessboard distance' due to the moves a king can make.
 */
public class ChebyshevDistanceMeasure implements DistanceMeasure {

  /** No-op: this measure reads nothing from the job configuration. */
  @Override
  public void configure(Configuration job) {
    // nothing to do
  }

  /** @return an empty list; this measure exposes no tunable parameters */
  @Override
  public Collection<Parameter<?>> getParameters() {
    return Collections.emptyList();
  }

  /** No-op: there are no parameters to create for this measure. */
  @Override
  public void createParameters(String prefix, Configuration jobConf) {
    // nothing to do
  }

  /**
   * Computes the Chebyshev (L-infinity) distance: element pairs are mapped with
   * MINUS and aggregated with MAX_ABS, i.e. max over i of |v1_i - v2_i|.
   *
   * @throws CardinalityException if the two vectors differ in size
   */
  @Override
  public double distance(Vector v1, Vector v2) {
    if (v1.size() != v2.size()) {
      throw new CardinalityException(v1.size(), v2.size());
    }
    return v1.aggregate(v2, Functions.MAX_ABS, Functions.MINUS);
  }

  /**
   * Delegates to {@link #distance(Vector, Vector)}; the precomputed
   * centroidLengthSquare is ignored by this measure.
   */
  @Override
  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
    return distance(centroid, v); // TODO
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
new file mode 100644
index 0000000..37265eb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/CosineDistanceMeasure.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class implements a cosine distance metric by dividing the dot product of two vectors by the product of their
+ * lengths. That gives the cosine of the angle between the two vectors. To convert this to a usable distance,
+ * 1-cos(angle) is what is actually returned.
+ */
+/**
+ * This class implements a cosine distance metric by dividing the dot product of two vectors by the product of their
+ * lengths. That gives the cosine of the angle between the two vectors. To convert this to a usable distance,
+ * 1-cos(angle) is what is actually returned.
+ */
+public class CosineDistanceMeasure implements DistanceMeasure {
+
+  @Override
+  public void configure(Configuration job) {
+    // nothing to do
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    // nothing to do
+  }
+
+  /**
+   * Shared tail of all three distance computations: converts a dot product and
+   * a product of vector lengths into 1 - cos(angle), guarding against
+   * floating-point round-off (which could push cos(angle) above 1 and produce
+   * a tiny negative distance) and against the zero-vector corner case.
+   *
+   * @param dotProduct  dot product of the two vectors
+   * @param denominator product of the two vector lengths
+   * @return 1.0 - dotProduct / denominator, clamped at 0; 0 when both args are 0
+   */
+  private static double asDistance(double dotProduct, double denominator) {
+    // correct for floating-point rounding errors
+    if (denominator < dotProduct) {
+      denominator = dotProduct;
+    }
+
+    // correct for zero-vector corner case
+    if (denominator == 0 && dotProduct == 0) {
+      return 0;
+    }
+
+    return 1.0 - dotProduct / denominator;
+  }
+
+  /**
+   * Computes the cosine distance between two dense points.
+   *
+   * @param p1 first point; must be the same length as {@code p2}
+   * @param p2 second point
+   * @return 1 - cos(angle between p1 and p2), in [0, 2]
+   */
+  public static double distance(double[] p1, double[] p2) {
+    double dotProduct = 0.0;
+    double lengthSquaredp1 = 0.0;
+    double lengthSquaredp2 = 0.0;
+    for (int i = 0; i < p1.length; i++) {
+      lengthSquaredp1 += p1[i] * p1[i];
+      lengthSquaredp2 += p2[i] * p2[i];
+      dotProduct += p1[i] * p2[i];
+    }
+    double denominator = Math.sqrt(lengthSquaredp1) * Math.sqrt(lengthSquaredp2);
+    return asDistance(dotProduct, denominator);
+  }
+
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    if (v1.size() != v2.size()) {
+      throw new CardinalityException(v1.size(), v2.size());
+    }
+    double lengthSquaredv1 = v1.getLengthSquared();
+    double lengthSquaredv2 = v2.getLengthSquared();
+
+    double dotProduct = v2.dot(v1);
+    double denominator = Math.sqrt(lengthSquaredv1) * Math.sqrt(lengthSquaredv2);
+    return asDistance(dotProduct, denominator);
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    // caller supplies |centroid|^2 so it is not recomputed per point
+    double lengthSquaredv = v.getLengthSquared();
+
+    double dotProduct = v.dot(centroid);
+    double denominator = Math.sqrt(centroidLengthSquare) * Math.sqrt(lengthSquaredv);
+    return asDistance(dotProduct, denominator);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
new file mode 100644
index 0000000..696e79c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/DistanceMeasure.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.common.parameters.Parametered;
+import org.apache.mahout.math.Vector;
+
+/** This interface is used for objects which can determine a distance metric between two points */
+/** This interface is used for objects which can determine a distance metric between two points */
+public interface DistanceMeasure extends Parametered {
+
+  /**
+   * Returns the distance metric applied to the arguments.
+   *
+   * @param v1
+   *          a Vector defining a multidimensional point in some feature space
+   * @param v2
+   *          a Vector defining a multidimensional point in some feature space
+   * @return a scalar double measuring the distance between v1 and v2
+   */
+  double distance(Vector v1, Vector v2);
+
+  /**
+   * Optimized version of distance metric for sparse vectors. This distance computation requires operations
+   * proportional to the number of non-zero elements in the vector instead of the cardinality of the vector.
+   *
+   * @param centroidLengthSquare
+   *          Square of the length of centroid
+   * @param centroid
+   *          Centroid vector
+   * @param v
+   *          a Vector defining a multidimensional point in some feature space
+   * @return a scalar double measuring the distance between the centroid and v
+   */
+  double distance(double centroidLengthSquare, Vector centroid, Vector v);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
new file mode 100644
index 0000000..665678d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/EuclideanDistanceMeasure.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class implements a Euclidean distance metric by summing the square root of the squared differences
+ * between each coordinate.
+ * <p/>
+ * If you don't care about the true distance and only need the values for comparison, then the base class,
+ * {@link SquaredEuclideanDistanceMeasure}, will be faster since it doesn't do the actual square root of the
+ * squared differences.
+ */
+/**
+ * This class implements a Euclidean distance metric: the square root of the sum of the squared differences
+ * between each coordinate.
+ * <p/>
+ * If you don't care about the true distance and only need the values for comparison, then the base class,
+ * {@link SquaredEuclideanDistanceMeasure}, will be faster since it doesn't do the actual square root of the
+ * squared differences.
+ */
+public class EuclideanDistanceMeasure extends SquaredEuclideanDistanceMeasure {
+
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    // true Euclidean distance: sqrt of the base class's squared distance
+    return Math.sqrt(super.distance(v1, v2));
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    // sparse-optimized variant, again just the sqrt of the squared form
+    return Math.sqrt(super.distance(centroidLengthSquare, centroid, v));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
new file mode 100644
index 0000000..17ee714
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MahalanobisDistanceMeasure.java
@@ -0,0 +1,197 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.io.DataInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.parameters.ClassParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.common.parameters.PathParameter;
+import org.apache.mahout.math.Algebra;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.SingularValueDecomposition;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+//See http://en.wikipedia.org/wiki/Mahalanobis_distance for details
+public class MahalanobisDistanceMeasure implements DistanceMeasure {
+
+ private Matrix inverseCovarianceMatrix;
+ private Vector meanVector;
+
+ private ClassParameter vectorClass;
+ private ClassParameter matrixClass;
+ private List<Parameter<?>> parameters;
+ private Parameter<Path> inverseCovarianceFile;
+ private Parameter<Path> meanVectorFile;
+
+ /*public MahalanobisDistanceMeasure(Vector meanVector,Matrix inputMatrix, boolean inversionNeeded)
+ {
+ this.meanVector=meanVector;
+ if (inversionNeeded)
+ setCovarianceMatrix(inputMatrix);
+ else
+ setInverseCovarianceMatrix(inputMatrix);
+ }*/
+
+ @Override
+ public void configure(Configuration jobConf) {
+ if (parameters == null) {
+ ParameteredGeneralizations.configureParameters(this, jobConf);
+ }
+ try {
+ if (inverseCovarianceFile.get() != null) {
+ FileSystem fs = FileSystem.get(inverseCovarianceFile.get().toUri(), jobConf);
+ MatrixWritable inverseCovarianceMatrix =
+ ClassUtils.instantiateAs((Class<? extends MatrixWritable>) matrixClass.get(), MatrixWritable.class);
+ if (!fs.exists(inverseCovarianceFile.get())) {
+ throw new FileNotFoundException(inverseCovarianceFile.get().toString());
+ }
+ try (DataInputStream in = fs.open(inverseCovarianceFile.get())){
+ inverseCovarianceMatrix.readFields(in);
+ }
+ this.inverseCovarianceMatrix = inverseCovarianceMatrix.get();
+ Preconditions.checkArgument(this.inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+ }
+
+ if (meanVectorFile.get() != null) {
+ FileSystem fs = FileSystem.get(meanVectorFile.get().toUri(), jobConf);
+ VectorWritable meanVector =
+ ClassUtils.instantiateAs((Class<? extends VectorWritable>) vectorClass.get(), VectorWritable.class);
+ if (!fs.exists(meanVectorFile.get())) {
+ throw new FileNotFoundException(meanVectorFile.get().toString());
+ }
+ try (DataInputStream in = fs.open(meanVectorFile.get())){
+ meanVector.readFields(in);
+ }
+ this.meanVector = meanVector.get();
+ Preconditions.checkArgument(this.meanVector != null, "meanVector not initialized");
+ }
+
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ @Override
+ public Collection<Parameter<?>> getParameters() {
+ return parameters;
+ }
+
+ @Override
+ public void createParameters(String prefix, Configuration jobConf) {
+ parameters = new ArrayList<>();
+ inverseCovarianceFile = new PathParameter(prefix, "inverseCovarianceFile", jobConf, null,
+ "Path on DFS to a file containing the inverse covariance matrix.");
+ parameters.add(inverseCovarianceFile);
+
+ matrixClass = new ClassParameter(prefix, "maxtrixClass", jobConf, DenseMatrix.class,
+ "Class<Matix> file specified in parameter inverseCovarianceFile has been serialized with.");
+ parameters.add(matrixClass);
+
+ meanVectorFile = new PathParameter(prefix, "meanVectorFile", jobConf, null,
+ "Path on DFS to a file containing the mean Vector.");
+ parameters.add(meanVectorFile);
+
+ vectorClass = new ClassParameter(prefix, "vectorClass", jobConf, DenseVector.class,
+ "Class file specified in parameter meanVectorFile has been serialized with.");
+ parameters.add(vectorClass);
+ }
+
+ /**
+ * @param v The vector to compute the distance to
+ * @return Mahalanobis distance of a multivariate vector
+ */
+ public double distance(Vector v) {
+ return Math.sqrt(v.minus(meanVector).dot(Algebra.mult(inverseCovarianceMatrix, v.minus(meanVector))));
+ }
+
+ @Override
+ public double distance(Vector v1, Vector v2) {
+ if (v1.size() != v2.size()) {
+ throw new CardinalityException(v1.size(), v2.size());
+ }
+ return Math.sqrt(v1.minus(v2).dot(Algebra.mult(inverseCovarianceMatrix, v1.minus(v2))));
+ }
+
+ @Override
+ public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+ return distance(centroid, v); // TODO
+ }
+
+ public void setInverseCovarianceMatrix(Matrix inverseCovarianceMatrix) {
+ Preconditions.checkArgument(inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+ this.inverseCovarianceMatrix = inverseCovarianceMatrix;
+ }
+
+
+ /**
+ * Computes the inverse covariance from the input covariance matrix given in input.
+ *
+ * @param m A covariance matrix.
+ * @throws IllegalArgumentException if <tt>eigen values equal to 0 found</tt>.
+ */
+ public void setCovarianceMatrix(Matrix m) {
+ if (m.numRows() != m.numCols()) {
+ throw new CardinalityException(m.numRows(), m.numCols());
+ }
+ // See http://www.mlahanas.de/Math/svd.htm for details,
+ // which specifically details the case of covariance matrix inversion
+ // Complexity: O(min(nm2,mn2))
+ SingularValueDecomposition svd = new SingularValueDecomposition(m);
+ Matrix sInv = svd.getS();
+ // Inverse Diagonal Elems
+ for (int i = 0; i < sInv.numRows(); i++) {
+ double diagElem = sInv.get(i, i);
+ if (diagElem > 0.0) {
+ sInv.set(i, i, 1 / diagElem);
+ } else {
+ throw new IllegalStateException("Eigen Value equals to 0 found.");
+ }
+ }
+ inverseCovarianceMatrix = svd.getU().times(sInv.times(svd.getU().transpose()));
+ Preconditions.checkArgument(inverseCovarianceMatrix != null, "inverseCovarianceMatrix not initialized");
+ }
+
+ public Matrix getInverseCovarianceMatrix() {
+ return inverseCovarianceMatrix;
+ }
+
+ public void setMeanVector(Vector meanVector) {
+ Preconditions.checkArgument(meanVector != null, "meanVector not initialized");
+ this.meanVector = meanVector;
+ }
+
+ public Vector getMeanVector() {
+ return meanVector;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
new file mode 100644
index 0000000..5c32fcf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/ManhattanDistanceMeasure.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.CardinalityException;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * This class implements a "manhattan distance" metric by summing the absolute values of the difference
+ * between each coordinate
+ */
+/**
+ * Manhattan (L1, "taxicab") distance: the sum of the absolute differences
+ * between corresponding coordinates of the two points.
+ */
+public class ManhattanDistanceMeasure implements DistanceMeasure {
+
+  /**
+   * Computes the L1 distance between two dense points.
+   *
+   * @param p1 first point; must be the same length as {@code p2}
+   * @param p2 second point
+   * @return the sum of |p2[i] - p1[i]| over all coordinates
+   */
+  public static double distance(double[] p1, double[] p2) {
+    double sum = 0.0;
+    for (int i = 0; i < p1.length; i++) {
+      sum += Math.abs(p2[i] - p1[i]);
+    }
+    return sum;
+  }
+
+  @Override
+  public void configure(Configuration job) {
+    // stateless measure: nothing to configure
+  }
+
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    // stateless measure: no parameters to create
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public double distance(Vector v1, Vector v2) {
+    int size1 = v1.size();
+    int size2 = v2.size();
+    if (size1 != size2) {
+      throw new CardinalityException(size1, size2);
+    }
+    // sum over |v1[i] - v2[i]|
+    return v1.aggregate(v2, Functions.PLUS, Functions.MINUS_ABS);
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    // no sparse shortcut available; fall back to the pairwise form (TODO)
+    return distance(centroid, v);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
new file mode 100644
index 0000000..c3a48cb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/MinkowskiDistanceMeasure.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.DoubleParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * Implement Minkowski distance, a real-valued generalization of the
+ * integral L(n) distances: Manhattan = L1, Euclidean = L2.
+ * For high numbers of dimensions, very high exponents give more useful distances.
+ *
+ * Note: Math.pow is clever about integer-valued doubles.
+ **/
+public class MinkowskiDistanceMeasure implements DistanceMeasure {
+
+ private static final double EXPONENT = 3.0;
+
+ private List<Parameter<?>> parameters;
+ private double exponent = EXPONENT;
+
+ public MinkowskiDistanceMeasure() {
+ }
+
+ public MinkowskiDistanceMeasure(double exponent) {
+ this.exponent = exponent;
+ }
+
+ @Override
+ public void createParameters(String prefix, Configuration conf) {
+ parameters = new ArrayList<>();
+ Parameter<?> param =
+ new DoubleParameter(prefix, "exponent", conf, EXPONENT, "Exponent for Fractional Lagrange distance");
+ parameters.add(param);
+ }
+
+ @Override
+ public Collection<Parameter<?>> getParameters() {
+ return parameters;
+ }
+
+ @Override
+ public void configure(Configuration jobConf) {
+ if (parameters == null) {
+ ParameteredGeneralizations.configureParameters(this, jobConf);
+ }
+ }
+
+ public double getExponent() {
+ return exponent;
+ }
+
+ public void setExponent(double exponent) {
+ this.exponent = exponent;
+ }
+
+ /**
+ * Math.pow is clever about integer-valued doubles
+ */
+ @Override
+ public double distance(Vector v1, Vector v2) {
+ return Math.pow(v1.aggregate(v2, Functions.PLUS, Functions.minusAbsPow(exponent)), 1.0 / exponent);
+ }
+
+ // TODO: how?
+ @Override
+ public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+ return distance(centroid, v); // TODO - can this use centroidLengthSquare somehow?
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
new file mode 100644
index 0000000..66da121
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/SquaredEuclideanDistanceMeasure.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Like {@link EuclideanDistanceMeasure} but it does not take the square root.
+ * <p/>
+ * Thus, it is not actually the Euclidean Distance, but it is saves on computation when you only need the
+ * distance for comparison and don't care about the actual value as a distance.
+ */
+public class SquaredEuclideanDistanceMeasure implements DistanceMeasure {
+
+ @Override
+ public void configure(Configuration job) {
+ // nothing to do
+ }
+
+ @Override
+ public Collection<Parameter<?>> getParameters() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ public void createParameters(String prefix, Configuration jobConf) {
+ // nothing to do
+ }
+
+ @Override
+ public double distance(Vector v1, Vector v2) {
+ return v2.getDistanceSquared(v1);
+ }
+
+ @Override
+ public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+ return centroidLengthSquare - 2 * v.dot(centroid) + v.getLengthSquared();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
new file mode 100644
index 0000000..cfeb119
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/TanimotoDistanceMeasure.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * Tanimoto coefficient implementation.
+ *
+ * http://en.wikipedia.org/wiki/Jaccard_index
+ */
+/**
+ * Tanimoto coefficient implementation.
+ *
+ * http://en.wikipedia.org/wiki/Jaccard_index
+ */
+public class TanimotoDistanceMeasure extends WeightedDistanceMeasure {
+
+  /**
+   * Calculates the distance between two vectors.
+   *
+   * The coefficient (a measure of similarity) is: T(a, b) = a.b / (|a|^2 + |b|^2 - a.b)
+   *
+   * The distance d(a,b) = 1 - T(a,b)
+   *
+   * @return 0 for perfect match, > 0 for greater distance
+   */
+  @Override
+  public double distance(Vector a, Vector b) {
+    double ab;
+    double denominator;
+    if (getWeights() != null) {
+      // weighted dot product: sum of a_i * b_i * w_i
+      ab = a.times(b).aggregate(getWeights(), Functions.PLUS, Functions.MULT);
+      // NOTE(review): MULT_SQUARE_LEFT presumably yields x^2 * w for each
+      // (x, w) pair, i.e. a weighted squared length — confirm against Functions
+      denominator = a.aggregate(getWeights(), Functions.PLUS, Functions.MULT_SQUARE_LEFT)
+          + b.aggregate(getWeights(), Functions.PLUS, Functions.MULT_SQUARE_LEFT)
+          - ab;
+    } else {
+      ab = b.dot(a); // b is SequentialAccess
+      denominator = a.getLengthSquared() + b.getLengthSquared() - ab;
+    }
+
+    if (denominator < ab) { // correct for fp round-off: distance >= 0
+      denominator = ab;
+    }
+    if (denominator > 0) {
+      // denominator == 0 only when dot(a,a) == dot(b,b) == dot(a,b) == 0
+      return 1.0 - ab / denominator;
+    } else {
+      // both vectors are zero: define them as a perfect match
+      return 0.0;
+    }
+  }
+
+  @Override
+  public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+    return distance(centroid, v); // TODO
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
new file mode 100644
index 0000000..1acbe86
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedDistanceMeasure.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import java.io.DataInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.parameters.ClassParameter;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.common.parameters.PathParameter;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/** Abstract implementation of DistanceMeasure with support for weights. */
+/** Abstract implementation of DistanceMeasure with support for weights. */
+public abstract class WeightedDistanceMeasure implements DistanceMeasure {
+
+  private List<Parameter<?>> parameters;
+  // DFS path of the serialized weight vector; null means unweighted
+  private Parameter<Path> weightsFile;
+  // concrete Vector class the weights file was serialized with
+  private ClassParameter vectorClass;
+  private Vector weights;
+
+  @Override
+  public void createParameters(String prefix, Configuration jobConf) {
+    parameters = new ArrayList<>();
+    weightsFile = new PathParameter(prefix, "weightsFile", jobConf, null,
+        "Path on DFS to a file containing the weights.");
+    parameters.add(weightsFile);
+    vectorClass = new ClassParameter(prefix, "vectorClass", jobConf, DenseVector.class,
+        "Class<Vector> file specified in parameter weightsFile has been serialized with.");
+    parameters.add(vectorClass);
+  }
+
+  @Override
+  public Collection<Parameter<?>> getParameters() {
+    return parameters;
+  }
+
+  /**
+   * Loads the weight vector from the configured DFS path, if one is set.
+   * Wraps any I/O failure in an IllegalStateException.
+   */
+  @Override
+  public void configure(Configuration jobConf) {
+    if (parameters == null) {
+      // lazily create and populate the parameters on first use
+      ParameteredGeneralizations.configureParameters(this, jobConf);
+    }
+    try {
+      if (weightsFile.get() != null) {
+        FileSystem fs = FileSystem.get(weightsFile.get().toUri(), jobConf);
+        // NOTE(review): unchecked cast — vectorClass is assumed to name a
+        // VectorWritable subclass; local 'weights' deliberately shadows the field
+        VectorWritable weights =
+            ClassUtils.instantiateAs((Class<? extends VectorWritable>) vectorClass.get(), VectorWritable.class);
+        if (!fs.exists(weightsFile.get())) {
+          throw new FileNotFoundException(weightsFile.get().toString());
+        }
+        try (DataInputStream in = fs.open(weightsFile.get())){
+          weights.readFields(in);
+        }
+        this.weights = weights.get();
+      }
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  public Vector getWeights() {
+    return weights;
+  }
+
+  public void setWeights(Vector weights) {
+    this.weights = weights;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
new file mode 100644
index 0000000..4c78d9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedEuclideanDistanceMeasure.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * A Euclidean distance metric: the square root of the sum of the squared
+ * coordinate differences, each squared difference optionally scaled by a
+ * per-coordinate weight.
+ */
+public class WeightedEuclideanDistanceMeasure extends WeightedDistanceMeasure {
+
+ @Override
+ public double distance(Vector p1, Vector p2) {
+   Vector diff = p2.minus(p1);
+   Vector w = getWeights();
+   double sum = 0.0;
+   // Only non-zero entries of the difference contribute to the sum.
+   for (Element e : diff.nonZeroes()) {
+     double d = e.get();
+     sum += (w == null) ? d * d : d * d * w.get(e.index());
+   }
+   return Math.sqrt(sum);
+ }
+
+ @Override
+ public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+   // The precomputed centroid norm is not exploited yet. TODO
+   return distance(centroid, v);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
new file mode 100644
index 0000000..2c280e2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/distance/WeightedManhattanDistanceMeasure.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.distance;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * A "Manhattan distance" metric: the sum of the absolute values of the
+ * coordinate differences, each optionally scaled by a per-coordinate weight.
+ */
+public class WeightedManhattanDistanceMeasure extends WeightedDistanceMeasure {
+
+ @Override
+ public double distance(Vector p1, Vector p2) {
+   double result = 0;
+
+   Vector res = p2.minus(p1);
+   // Hoist the weights lookup out of the loop (the previous code called
+   // getWeights() once per non-zero element); also matches the style of
+   // WeightedEuclideanDistanceMeasure.
+   Vector theWeights = getWeights();
+   if (theWeights == null) {
+     for (Element elt : res.nonZeroes()) {
+       result += Math.abs(elt.get());
+     }
+   } else {
+     for (Element elt : res.nonZeroes()) {
+       result += Math.abs(elt.get() * theWeights.get(elt.index()));
+     }
+   }
+
+   return result;
+ }
+
+ @Override
+ public double distance(double centroidLengthSquare, Vector centroid, Vector v) {
+   // centroidLengthSquare is not exploited yet. TODO
+   return distance(centroid, v);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
new file mode 100644
index 0000000..73cc821
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CopyConstructorIterator.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Iterator;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+
+/**
+ * An iterator that copies the values in an underlying iterator by finding an appropriate copy constructor.
+ * <p/>
+ * The copy constructor is resolved lazily from the runtime class of the first element
+ * and then reused for every subsequent element.
+ */
+public final class CopyConstructorIterator<T> extends ForwardingIterator<T> {
+
+ private final Iterator<T> delegate;
+ // Lazily initialized on the first apply() call; mutated from inside the
+ // transform function below.
+ private Constructor<T> constructor;
+
+ public CopyConstructorIterator(Iterator<? extends T> copyFrom) {
+ this.delegate = Iterators.transform(
+ copyFrom,
+ new Function<T,T>() {
+ @Override
+ public T apply(T from) {
+ if (constructor == null) {
+ // NOTE(review): the cached constructor comes from the first element's class;
+ // assumes all elements share that exact runtime class — confirm with callers.
+ Class<T> elementClass = (Class<T>) from.getClass();
+ try {
+ constructor = elementClass.getConstructor(elementClass);
+ } catch (NoSuchMethodException e) {
+ // No public copy constructor: surface as an unchecked failure.
+ throw new IllegalStateException(e);
+ }
+ }
+ try {
+ return constructor.newInstance(from);
+ } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ });
+ }
+
+ @Override
+ protected Iterator<T> delegate() {
+ return delegate;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
new file mode 100644
index 0000000..658c1f1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/CountingIterator.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import com.google.common.collect.AbstractIterator;
+
+/**
+ * Iterates over the integers from 0 (inclusive) through {@code to} (exclusive).
+ */
+public final class CountingIterator extends AbstractIterator<Integer> {
+
+ private final int to;
+ private int count;
+
+ public CountingIterator(int to) {
+   this.to = to;
+ }
+
+ @Override
+ protected Integer computeNext() {
+   // Guard clause: once the upper bound is reached the iterator is exhausted.
+   if (count >= to) {
+     return endOfData();
+   }
+   return count++;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
new file mode 100644
index 0000000..cfc18d6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterable.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+
+import com.google.common.base.Charsets;
+
+/**
+ * Iterable representing the lines of a text file. It can produce an {@link Iterator} over those lines.
+ * This assumes the text file's lines are delimited in a manner consistent with how
+ * {@link java.io.BufferedReader} defines lines.
+ */
+public final class FileLineIterable implements Iterable<String> {
+
+ private final InputStream is;
+ private final Charset encoding;
+ private final boolean skipFirstLine;
+ private final String origFilename;
+
+ /** Creates a {@link FileLineIterable} over a given file, assuming a UTF-8 encoding. */
+ public FileLineIterable(File file) throws IOException {
+   this(file, Charsets.UTF_8, false);
+ }
+
+ /** Creates a {@link FileLineIterable} over a given file, assuming a UTF-8 encoding. */
+ public FileLineIterable(File file, boolean skipFirstLine) throws IOException {
+   this(file, Charsets.UTF_8, skipFirstLine);
+ }
+
+ /** Creates a {@link FileLineIterable} over a given file, using the given encoding. */
+ public FileLineIterable(File file, Charset encoding, boolean skipFirstLine) throws IOException {
+   this(FileLineIterator.getFileInputStream(file), encoding, skipFirstLine);
+ }
+
+ public FileLineIterable(InputStream is) {
+   this(is, Charsets.UTF_8, false);
+ }
+
+ public FileLineIterable(InputStream is, boolean skipFirstLine) {
+   this(is, Charsets.UTF_8, skipFirstLine);
+ }
+
+ public FileLineIterable(InputStream is, Charset encoding, boolean skipFirstLine) {
+   // Delegate to the canonical constructor with an empty original filename.
+   this(is, encoding, skipFirstLine, "");
+ }
+
+ public FileLineIterable(InputStream is, Charset encoding, boolean skipFirstLine, String filename) {
+   this.is = is;
+   this.encoding = encoding;
+   this.skipFirstLine = skipFirstLine;
+   this.origFilename = filename;
+ }
+
+ @Override
+ public Iterator<String> iterator() {
+   try {
+     return new FileLineIterator(is, encoding, skipFirstLine, origFilename);
+   } catch (IOException ioe) {
+     // Iterable.iterator() cannot throw checked exceptions.
+     throw new IllegalStateException(ioe);
+   }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
new file mode 100644
index 0000000..b7cc51e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FileLineIterator.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipInputStream;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Iterates over the lines of a text file. This assumes the text file's lines are delimited in a manner
+ * consistent with how {@link BufferedReader} defines lines.
+ * <p/>
+ * This class will uncompress files that end in .zip or .gz accordingly, too.
+ */
+public final class FileLineIterator extends AbstractIterator<String> implements SkippingIterator<String>, Closeable {
+
+ private final BufferedReader reader;
+
+ private static final Logger log = LoggerFactory.getLogger(FileLineIterator.class);
+
+ /**
+  * Creates a {@link FileLineIterator} over a given file, assuming a UTF-8 encoding.
+  *
+  * @throws java.io.FileNotFoundException if the file does not exist
+  * @throws IOException if the file cannot be read
+  */
+ public FileLineIterator(File file) throws IOException {
+   this(file, Charsets.UTF_8, false);
+ }
+
+ /**
+  * Creates a {@link FileLineIterator} over a given file, assuming a UTF-8 encoding.
+  *
+  * @throws java.io.FileNotFoundException if the file does not exist
+  * @throws IOException if the file cannot be read
+  */
+ public FileLineIterator(File file, boolean skipFirstLine) throws IOException {
+   this(file, Charsets.UTF_8, skipFirstLine);
+ }
+
+ /**
+  * Creates a {@link FileLineIterator} over a given file, using the given encoding.
+  *
+  * @throws java.io.FileNotFoundException if the file does not exist
+  * @throws IOException if the file cannot be read
+  */
+ public FileLineIterator(File file, Charset encoding, boolean skipFirstLine) throws IOException {
+   this(getFileInputStream(file), encoding, skipFirstLine);
+ }
+
+ public FileLineIterator(InputStream is) throws IOException {
+   this(is, Charsets.UTF_8, false);
+ }
+
+ public FileLineIterator(InputStream is, boolean skipFirstLine) throws IOException {
+   this(is, Charsets.UTF_8, skipFirstLine);
+ }
+
+ public FileLineIterator(InputStream is, Charset encoding, boolean skipFirstLine) throws IOException {
+   reader = new BufferedReader(new InputStreamReader(is, encoding));
+   if (skipFirstLine) {
+     reader.readLine();
+   }
+ }
+
+ /**
+  * Creates a {@link FileLineIterator} over a stream, uncompressing it first when
+  * {@code filename} indicates gzip or zip content.
+  */
+ public FileLineIterator(InputStream is, Charset encoding, boolean skipFirstLine, String filename)
+   throws IOException {
+   reader = new BufferedReader(new InputStreamReader(maybeDecompress(is, filename), encoding));
+   if (skipFirstLine) {
+     reader.readLine();
+   }
+ }
+
+ static InputStream getFileInputStream(File file) throws IOException {
+   return maybeDecompress(new FileInputStream(file), file.getName());
+ }
+
+ /**
+  * Wraps {@code is} in a GZIP or ZIP decompressing stream when {@code name} ends in
+  * .gz or .zip; otherwise returns {@code is} unchanged. Shared by the filename-aware
+  * constructor and {@link #getFileInputStream(File)}, which previously duplicated
+  * this logic and computed the extension twice per call.
+  */
+ private static InputStream maybeDecompress(InputStream is, String name) throws IOException {
+   String extension = Files.getFileExtension(name.toLowerCase());
+   if ("gz".equalsIgnoreCase(extension)) {
+     return new GZIPInputStream(is);
+   }
+   if ("zip".equalsIgnoreCase(extension)) {
+     return new ZipInputStream(is);
+   }
+   return is;
+ }
+
+ @Override
+ protected String computeNext() {
+   String line;
+   try {
+     line = reader.readLine();
+   } catch (IOException ioe) {
+     // Best-effort close before surfacing the read failure.
+     try {
+       close();
+     } catch (IOException e) {
+       log.error(e.getMessage(), e);
+     }
+     throw new IllegalStateException(ioe);
+   }
+   return line == null ? endOfData() : line;
+ }
+
+ @Override
+ public void skip(int n) {
+   try {
+     for (int i = 0; i < n; i++) {
+       if (reader.readLine() == null) {
+         break;
+       }
+     }
+   } catch (IOException ioe) {
+     // Previously the triggering exception was silently swallowed; log it so the
+     // failure is at least visible, then close (flow is otherwise unchanged).
+     log.error(ioe.getMessage(), ioe);
+     try {
+       close();
+     } catch (IOException e) {
+       throw new IllegalStateException(e);
+     }
+   }
+ }
+
+ @Override
+ public void close() throws IOException {
+   // Mark the iterator exhausted before releasing the reader.
+   endOfData();
+   Closeables.close(reader, true);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
new file mode 100644
index 0000000..1905654
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/FixedSizeSamplingIterator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Samples a fixed number of elements from an Iterator (reservoir sampling).
+ * The results can appear in any order.
+ */
+public final class FixedSizeSamplingIterator<T> extends ForwardingIterator<T> {
+
+ private final Iterator<T> delegate;
+
+ public FixedSizeSamplingIterator(int size, Iterator<T> source) {
+   List<T> reservoir = Lists.newArrayListWithCapacity(size);
+   Random random = RandomUtils.getRandom();
+   int seen = 0;
+   while (source.hasNext()) {
+     T element = source.next();
+     seen++;
+     if (reservoir.size() < size) {
+       // Fill phase: keep the first `size` elements unconditionally.
+       reservoir.add(element);
+     } else {
+       // Replacement phase: element survives with probability size/seen.
+       int slot = random.nextInt(seen);
+       if (slot < reservoir.size()) {
+         reservoir.set(slot, element);
+       }
+     }
+   }
+   delegate = reservoir.iterator();
+ }
+
+ @Override
+ protected Iterator<T> delegate() {
+   return delegate;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
new file mode 100644
index 0000000..425b44b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterable.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+
+/**
+ * Wraps an {@link Iterable} whose {@link Iterable#iterator()} returns only some subset of the
+ * elements that it would, as determined by a sampling rate parameter.
+ */
+public final class SamplingIterable<T> implements Iterable<T> {
+
+ private final Iterable<? extends T> delegate;
+ private final double samplingRate;
+
+ public SamplingIterable(Iterable<? extends T> delegate, double samplingRate) {
+   this.delegate = delegate;
+   this.samplingRate = samplingRate;
+ }
+
+ @Override
+ public Iterator<T> iterator() {
+   return new SamplingIterator<>(delegate.iterator(), samplingRate);
+ }
+
+ /** Returns {@code delegate} unchanged when no sampling is needed (rate >= 1.0). */
+ public static <T> Iterable<T> maybeWrapIterable(Iterable<T> delegate, double samplingRate) {
+   if (samplingRate >= 1.0) {
+     return delegate;
+   }
+   return new SamplingIterable<>(delegate, samplingRate);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
new file mode 100644
index 0000000..2ba46fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/SamplingIterator.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.AbstractIterator;
+import org.apache.commons.math3.distribution.PascalDistribution;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+/**
+ * Wraps an {@link Iterator} and returns only some subset of the elements that it would, as
+ * determined by a sampling rate parameter.
+ * <p/>
+ * Rather than flipping a coin per element, it draws the gap to the next sampled element from
+ * a geometric distribution and skips that many elements at once.
+ */
+public final class SamplingIterator<T> extends AbstractIterator<T> {
+
+ private final PascalDistribution geometricDistribution;
+ private final Iterator<? extends T> delegate;
+
+ public SamplingIterator(Iterator<? extends T> delegate, double samplingRate) {
+ this(RandomUtils.getRandom(), delegate, samplingRate);
+ }
+
+ public SamplingIterator(RandomWrapper random, Iterator<? extends T> delegate, double samplingRate) {
+ Preconditions.checkNotNull(delegate);
+ Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0,
+ "Must be: 0.0 < samplingRate <= 1.0. But samplingRate = " + samplingRate);
+ // Geometric distribution is special case of negative binomial (aka Pascal) with r=1:
+ geometricDistribution = new PascalDistribution(random.getRandomGenerator(), 1, samplingRate);
+ this.delegate = delegate;
+ }
+
+ @Override
+ protected T computeNext() {
+ // Number of elements to discard before emitting the next sampled one.
+ int toSkip = geometricDistribution.sample();
+ if (delegate instanceof SkippingIterator<?>) {
+ // Fast path: the underlying iterator can skip without materializing elements.
+ SkippingIterator<? extends T> skippingDelegate = (SkippingIterator<? extends T>) delegate;
+ skippingDelegate.skip(toSkip);
+ if (skippingDelegate.hasNext()) {
+ return skippingDelegate.next();
+ }
+ } else {
+ // Slow path: consume and drop toSkip elements one by one.
+ for (int i = 0; i < toSkip && delegate.hasNext(); i++) {
+ delegate.next();
+ }
+ if (delegate.hasNext()) {
+ return delegate.next();
+ }
+ }
+ // Underlying iterator exhausted during the skip: no more samples.
+ return endOfData();
+ }
+
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
new file mode 100644
index 0000000..c4ddf7b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StableFixedSizeSamplingIterator.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Samples a fixed number of elements from an Iterator. The results will appear in the original
+ * order at some cost in time and memory relative to a FixedSizeSampler.
+ */
+public class StableFixedSizeSamplingIterator<T> extends ForwardingIterator<T> {
+
+ private final Iterator<T> delegate;
+
+ public StableFixedSizeSamplingIterator(int size, Iterator<T> source) {
+ // Reservoir of (1-based position, element) pairs so original order can be restored.
+ List<Pair<Integer,T>> buf = Lists.newArrayListWithCapacity(size);
+ int sofar = 0;
+ Random random = RandomUtils.getRandom();
+ while (source.hasNext()) {
+ T v = source.next();
+ sofar++;
+ if (buf.size() < size) {
+ // Fill phase: keep the first `size` elements unconditionally.
+ buf.add(new Pair<>(sofar, v));
+ } else {
+ // Replacement phase: element survives with probability size/sofar.
+ int position = random.nextInt(sofar);
+ if (position < buf.size()) {
+ buf.set(position, new Pair<>(sofar, v));
+ }
+ }
+ }
+
+ // NOTE(review): restoring input order via sort assumes Pair compares by its
+ // first (position) component — confirm against Pair's compareTo.
+ Collections.sort(buf);
+ delegate = Iterators.transform(buf.iterator(),
+ new Function<Pair<Integer,T>,T>() {
+ @Override
+ public T apply(Pair<Integer,T> from) {
+ return from.getSecond();
+ }
+ });
+ }
+
+ @Override
+ protected Iterator<T> delegate() {
+ return delegate;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
new file mode 100644
index 0000000..73b841e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/common/iterator/StringRecordIterator.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.common.iterator;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.Pair;
+
+/**
+ * Splits each line of the underlying iterable on a regex and emits the tokens
+ * paired with a count of 1.
+ */
+public class StringRecordIterator extends ForwardingIterator<Pair<List<String>,Long>> {
+
+ private static final Long ONE = 1L;
+
+ private final Pattern splitter;
+ private final Iterator<Pair<List<String>,Long>> delegate;
+
+ public StringRecordIterator(Iterable<String> stringIterator, String pattern) {
+   // Compile the split pattern once; reused for every line.
+   this.splitter = Pattern.compile(pattern);
+   delegate = Iterators.transform(
+       stringIterator.iterator(),
+       new Function<String,Pair<List<String>,Long>>() {
+         @Override
+         public Pair<List<String>,Long> apply(String line) {
+           List<String> tokens = Arrays.asList(splitter.split(line));
+           return new Pair<>(tokens, ONE);
+         }
+       });
+ }
+
+ @Override
+ protected Iterator<Pair<List<String>,Long>> delegate() {
+   return delegate;
+ }
+
+}
r***@apache.org
2018-06-28 14:54:34 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
new file mode 100644
index 0000000..96f36d4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0Mapper.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+/**
+ * Run ensemble learning via loading the {@link ModelTrainer} with two {@link TopicModel} instances:
+ * one from the previous iteration, the other empty. Inference is done on the first, and the
+ * learning updates are stored in the second, and only emitted at cleanup().
+ * <p/>
+ * In terms of obvious performance improvements still available, the memory footprint in this
+ * Mapper could be dropped by half if we accumulated model updates onto the model we're using
+ * for inference, which might also speed up convergence, as we'd be able to take advantage of
+ * learning <em>during</em> iteration, not just after each one is done. Most likely we don't
+ * really need to accumulate double values in the model either, floats would most likely be
+ * sufficient. Between these two, we could squeeze another factor of 4 in memory efficiency.
+ * <p/>
+ * In terms of CPU, we're re-learning the p(topic|doc) distribution on every iteration, starting
+ * from scratch. This is usually only 10 fixed-point iterations per doc, but that's 10x more than
+ * only 1. To avoid having to do this, we would need to do a map-side join of the unchanging
+ * corpus with the continually-improving p(topic|doc) matrix, and then emit multiple outputs
+ * from the mappers to make sure we can do the reduce model averaging as well. Tricky, but
+ * possibly worth it.
+ * <p/>
+ * {@link ModelTrainer} already takes advantage (in maybe the not-nice way) of multi-core
+ * availability by doing multithreaded learning, see that class for details.
+ */
public class CachingCVB0Mapper
    extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private static final Logger log = LoggerFactory.getLogger(CachingCVB0Mapper.class);

  private ModelTrainer modelTrainer; // multithreaded trainer wrapping readModel/writeModel
  private TopicModel readModel;      // model inference runs against (previous iteration's state)
  private TopicModel writeModel;     // model learning updates are accumulated into
  private int maxIters;              // max fixed-point iterations of p(topic|doc) per document
  private int numTopics;             // k, the number of latent topics

  /** @return the trainer created in {@link #setup}; exposed for subclasses. */
  protected ModelTrainer getModelTrainer() {
    return modelTrainer;
  }

  /** @return maximum per-document fixed-point iterations, from job configuration. */
  protected int getMaxIters() {
    return maxIters;
  }

  /** @return the configured number of topics. */
  protected int getNumTopics() {
    return numTopics;
  }

  /**
   * Reads all CVB0 hyper-parameters from the job {@link Configuration}, loads the previous
   * iteration's model from {@link CVB0Driver#getModelPaths} (or seeds a fresh random model if
   * none exist), then constructs and starts the multithreaded {@link ModelTrainer}.
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    // NOTE(review): eta/alpha default to NaN when the keys are unset -- presumably
    // CVB0Driver always sets them; confirm, since NaN would silently poison all model math.
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
      readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
      // First iteration: no prior model on disk, start from a seeded random model.
      log.info("No model files found");
      readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
          numTrainThreads, modelWeight);
    }

    log.info("Initializing write model");
    // With the default model weight (1), updates go into a separate empty model; otherwise
    // updates are accumulated directly onto the read model (online-style learning).
    writeModel = modelWeight == 1
        ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
        : readModel;

    log.info("Initializing model trainer");
    modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
    modelTrainer.start();
  }

  /**
   * Queues one document for (possibly asynchronous) training. Each document starts from a
   * uniform p(topic|doc) distribution; nothing is emitted here -- all output happens in
   * {@link #cleanup}.
   */
  @Override
  public void map(IntWritable docId, VectorWritable document, Context context)
      throws IOException, InterruptedException {
    /* where to get docTopics? currently re-initialized uniform for every document */
    Vector topicVector = new DenseVector(numTopics).assign(1.0 / numTopics);
    modelTrainer.train(document.get(), topicVector, true, maxIters);
  }

  /**
   * Stops the trainer (letting queued training work drain), then emits one
   * (topicId, termCounts vector) pair per topic of the trainer's read model.
   */
  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    log.info("Stopping model trainer");
    modelTrainer.stop();

    log.info("Writing model");
    TopicModel readFrom = modelTrainer.getReadModel();
    for (MatrixSlice topic : readFrom) {
      context.write(new IntWritable(topic.index()), new VectorWritable(topic.vector()));
    }
    // NOTE(review): when modelWeight != 1, writeModel == readModel, so stop() is invoked
    // twice on the same instance -- presumably idempotent; confirm in TopicModel.
    readModel.stop();
    writeModel.stop();
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
new file mode 100644
index 0000000..da77baf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CachingCVB0PerplexityMapper.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.MemoryUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Random;
+
/**
 * Estimates model perplexity on a sampled fraction of the corpus. For each sampled document
 * this mapper emits (documentWeight, perplexity) so the reducer can form a weighted average.
 */
public class CachingCVB0PerplexityMapper extends
    Mapper<IntWritable, VectorWritable, DoubleWritable, DoubleWritable> {
  /**
   * Hadoop counters for {@link CachingCVB0PerplexityMapper}, to aid in debugging.
   */
  public enum Counters {
    SAMPLED_DOCUMENTS
  }

  private static final Logger log = LoggerFactory.getLogger(CachingCVB0PerplexityMapper.class);

  private ModelTrainer modelTrainer; // used only for calculatePerplexity(); never start()ed here
  private TopicModel readModel;      // previous iteration's model (or random if none on disk)
  private int maxIters;              // fixed-point iterations of p(topic|doc) per document
  private int numTopics;
  private float testFraction;        // fraction of documents sampled into the estimate
  private Random random;             // seeded sampler, so runs are reproducible per-split
  private Vector topicVector;        // scratch p(topic|doc) vector, reset uniform per document
  private final DoubleWritable outKey = new DoubleWritable();   // reused: document L1 weight
  private final DoubleWritable outValue = new DoubleWritable(); // reused: document perplexity

  /**
   * Reads hyper-parameters from the job configuration and loads (or randomly seeds) the
   * read-only model. Also starts a background memory logger (5s period) for diagnostics.
   */
  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    MemoryUtil.startMemoryLogger(5000);

    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    // NOTE(review): eta/alpha default to NaN when unset -- presumably CVB0Driver always
    // sets them; confirm.
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    random = RandomUtils.getRandom(seed);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);
    testFraction = conf.getFloat(CVB0Driver.TEST_SET_FRACTION, 0.1f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
      readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
      log.info("No model files found");
      readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
          numTrainThreads, modelWeight);
    }

    log.info("Initializing model trainer");
    // No write model: this trainer is only used to score documents, not to learn.
    modelTrainer = new ModelTrainer(readModel, null, numTrainThreads, numTopics, numTerms);

    log.info("Initializing topic vector");
    topicVector = new DenseVector(new double[numTopics]);
  }

  /** Releases model resources and stops the memory logger started in {@link #setup}. */
  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    readModel.stop();
    MemoryUtil.stopMemoryLogger();
  }

  /**
   * Keeps each document with probability {@code testFraction}; for kept documents emits
   * key = L1 norm of the document (its total term weight, used as the averaging weight)
   * and value = the document's perplexity under the read model, starting inference from
   * a uniform topic distribution.
   */
  @Override
  public void map(IntWritable docId, VectorWritable document, Context context)
    throws IOException, InterruptedException {
    if (testFraction < 1.0f && random.nextFloat() >= testFraction) {
      return; // document not sampled into the test set
    }
    context.getCounter(Counters.SAMPLED_DOCUMENTS).increment(1);
    outKey.set(document.get().norm(1));
    outValue.set(modelTrainer.calculatePerplexity(document.get(), topicVector.assign(1.0 / numTopics), maxIters));
    context.write(outKey, outValue);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
new file mode 100644
index 0000000..d7d09c5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/InMemoryCollapsedVariationalBayes0.java
@@ -0,0 +1,492 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.DistributedRowMatrixWriter;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs the same algorithm as {@link CVB0Driver}, but sequentially, in memory. Memory requirements
+ * are currently: the entire corpus is read into RAM, two copies of the model (each of size
+ * numTerms * numTopics), and another matrix of size numDocs * numTopics is held in memory
+ * (to store p(topic|doc) for all docs).
+ *
+ * But if all this fits in memory, this can be significantly faster than an iterative MR job.
+ */
public class InMemoryCollapsedVariationalBayes0 extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(InMemoryCollapsedVariationalBayes0.class);

  private int numTopics;                     // k, number of latent topics
  private int numTerms;                      // vocabulary size
  private int numDocuments;                  // rows of corpusWeights
  private double alpha;                      // smoothing for p(topic|doc)
  private double eta;                       // smoothing for p(term|topic)
  //private int minDfCt;
  //private double maxDfPct;
  private boolean verbose = false;           // when true, dump model state each iteration
  private String[] terms;  // of length numTerms;
  private Matrix corpusWeights; // length numDocs;
  private double totalCorpusWeight;          // sum of L1 norms of all docs
  private double initialModelCorpusFraction; // |model|/|corpus| seed for online updates
  private Matrix docTopicCounts;             // numDocs x numTopics, p(topic|doc)
  private int numTrainingThreads;
  private int numUpdatingThreads;
  private ModelTrainer modelTrainer;

  private InMemoryCollapsedVariationalBayes0() {
    // only for main usage
  }

  public void setVerbose(boolean verbose) {
    this.verbose = verbose;
  }

  /**
   * Builds an in-memory CVB0 learner over an already-loaded corpus.
   *
   * @param corpus              term-weight matrix, one row per document
   * @param terms               dictionary (index -> term), or null if unavailable
   * @param numTopics           number of latent topics to learn
   * @param alpha               doc-topic smoothing
   * @param eta                 term-topic smoothing
   * @param numTrainingThreads  threads used for inference
   * @param numUpdatingThreads  threads used for model updates
   * @param modelCorpusFraction initial |model|/|corpus| weight for online updates (0 = batch)
   */
  public InMemoryCollapsedVariationalBayes0(Matrix corpus,
                                            String[] terms,
                                            int numTopics,
                                            double alpha,
                                            double eta,
                                            int numTrainingThreads,
                                            int numUpdatingThreads,
                                            double modelCorpusFraction) {
    //this.seed = seed;
    this.numTopics = numTopics;
    this.alpha = alpha;
    this.eta = eta;
    //this.minDfCt = 0;
    //this.maxDfPct = 1.0f;
    corpusWeights = corpus;
    numDocuments = corpus.numRows();
    this.terms = terms;
    this.initialModelCorpusFraction = modelCorpusFraction;
    numTerms = terms != null ? terms.length : corpus.numCols();
    // NOTE(review): termIdMap is built here but never read afterwards -- dead code,
    // candidate for removal.
    Map<String, Integer> termIdMap = new HashMap<>();
    if (terms != null) {
      for (int t = 0; t < terms.length; t++) {
        termIdMap.put(terms[t], t);
      }
    }
    this.numTrainingThreads = numTrainingThreads;
    this.numUpdatingThreads = numUpdatingThreads;
    postInitCorpus();
    initializeModel();
  }

  /**
   * Scans the corpus once to compute the total term weight (sum of row L1 norms) and the
   * number of nonzero entries, logging a summary.
   */
  private void postInitCorpus() {
    totalCorpusWeight = 0;
    int numNonZero = 0;
    for (int i = 0; i < numDocuments; i++) {
      Vector v = corpusWeights.viewRow(i);
      double norm;
      if (v != null && (norm = v.norm(1)) != 0) {
        numNonZero += v.getNumNondefaultElements();
        totalCorpusWeight += norm;
      }
    }
    String s = "Initializing corpus with %d docs, %d terms, %d nonzero entries, total termWeight %f";
    log.info(String.format(s, numDocuments, numTerms, numNonZero, totalCorpusWeight));
  }

  /**
   * Creates the read/write {@link TopicModel} pair and the {@link ModelTrainer}.
   * With a zero corpus fraction (batch mode) the write model is a separate empty model;
   * otherwise (online mode) updates accumulate onto the same randomly-seeded model.
   * Doc-topic counts start uniform at 1/numTopics.
   */
  private void initializeModel() {
    TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(), terms,
        numUpdatingThreads, initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
    topicModel.setConf(getConf());

    TopicModel updatedModel = initialModelCorpusFraction == 0
        ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
        : topicModel;
    updatedModel.setConf(getConf());
    docTopicCounts = new DenseMatrix(numDocuments, numTopics);
    docTopicCounts.assign(1.0 / numTopics);
    modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
  }

  /*
  private void inferDocuments(double convergence, int maxIter, boolean recalculate) {
    for (int docId = 0; docId < corpusWeights.numRows() ; docId++) {
      Vector inferredDocument = topicModel.infer(corpusWeights.viewRow(docId),
          docTopicCounts.viewRow(docId));
      // do what now?
    }
  }
  */

  /** Trains one full pass over all documents (no held-out fraction). */
  public void trainDocuments() {
    trainDocuments(0);
  }

  /**
   * Trains one full pass over the corpus, holding out roughly {@code testFraction} of
   * documents: docId is skipped when docId % (1/testFraction) == 0. Each document's
   * topic distribution is re-initialized uniform before training.
   */
  public void trainDocuments(double testFraction) {
    long start = System.nanoTime();
    modelTrainer.start();
    for (int docId = 0; docId < corpusWeights.numRows(); docId++) {
      if (testFraction == 0 || docId % (1 / testFraction) != 0) {
        Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId)
        modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics , true, 10);
      }
    }
    modelTrainer.stop();
    logTime("train documents", System.nanoTime() - start);
  }

  /*
  private double error(int docId) {
    Vector docTermCounts = corpusWeights.viewRow(docId);
    if (docTermCounts == null) {
      return 0;
    } else {
      Vector expectedDocTermCounts =
          topicModel.infer(corpusWeights.viewRow(docId), docTopicCounts.viewRow(docId));
      double expectedNorm = expectedDocTermCounts.norm(1);
      return expectedDocTermCounts.times(docTermCounts.norm(1)/expectedNorm)
          .minus(docTermCounts).norm(1);
    }
  }

  private double error() {
    long time = System.nanoTime();
    double error = 0;
    for (int docId = 0; docId < numDocuments; docId++) {
      error += error(docId);
    }
    logTime("error calculation", System.nanoTime() - time);
    return error / totalCorpusWeight;
  }
  */

  /** Convenience overload with no held-out test fraction. */
  public double iterateUntilConvergence(double minFractionalErrorChange,
      int maxIterations, int minIter) {
    return iterateUntilConvergence(minFractionalErrorChange, maxIterations, minIter, 0);
  }

  /**
   * Runs {@code minIter} burn-in passes (with holdout), then keeps iterating until either
   * {@code maxIterations} is reached or the fractional change in perplexity drops below
   * {@code minFractionalErrorChange}.
   *
   * NOTE(review): the post-burn-in loop calls trainDocuments() with no holdout while
   * perplexity is still measured against testFraction -- confirm this asymmetry is
   * intentional. Also, if minIter == 0 the first fractionalChange divides by
   * oldPerplexity == 0 (yielding Infinity/NaN under IEEE doubles).
   *
   * @return the final perplexity
   */
  public double iterateUntilConvergence(double minFractionalErrorChange,
      int maxIterations, int minIter, double testFraction) {
    int iter = 0;
    double oldPerplexity = 0;
    while (iter < minIter) {
      trainDocuments(testFraction);
      if (verbose) {
        log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
      }
      log.info("iteration {} complete", iter);
      oldPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
          testFraction);
      log.info("{} = perplexity", oldPerplexity);
      iter++;
    }
    double newPerplexity = 0;
    double fractionalChange = Double.MAX_VALUE;
    while (iter < maxIterations && fractionalChange > minFractionalErrorChange) {
      trainDocuments();
      if (verbose) {
        log.info("model after: {}: {}", iter, modelTrainer.getReadModel());
      }
      newPerplexity = modelTrainer.calculatePerplexity(corpusWeights, docTopicCounts,
          testFraction);
      log.info("{} = perplexity", newPerplexity);
      iter++;
      fractionalChange = Math.abs(newPerplexity - oldPerplexity) / oldPerplexity;
      log.info("{} = fractionalChange", fractionalChange);
      oldPerplexity = newPerplexity;
    }
    if (iter < maxIterations) {
      log.info(String.format("Converged! fractional error change: %f, error %f",
          fractionalChange, newPerplexity));
    } else {
      log.info(String.format("Reached max iteration count (%d), fractional error change: %f, error: %f",
          maxIterations, fractionalChange, newPerplexity));
    }
    return newPerplexity;
  }

  /** Persists the current model (p(term|topic)) to {@code outputPath} via the trainer. */
  public void writeModel(Path outputPath) throws IOException {
    modelTrainer.persist(outputPath);
  }

  /** Logs an elapsed-time measurement in milliseconds. */
  private static void logTime(String label, long nanos) {
    log.info("{} time: {}ms", label, nanos / 1.0e6);
  }

  /**
   * Command-line entry point body: parses options, loads the dictionary and corpus into
   * memory, trains to convergence, and writes out both p(term|topic) and p(topic|doc).
   *
   * NOTE(review): returns 0 even when option parsing fails (the OptionException branch
   * falls through to "return 0") -- confirm callers do not rely on a nonzero exit code.
   *
   * @return -1 if help was requested, otherwise 0
   */
  public static int main2(String[] args, Configuration conf) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
        abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
        "The Directory on HDFS containing the collapsed, properly formatted files having "
            + "one doc per line").withShortName("i").create();

    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
        abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
        "The path to the term-dictionary format is ... ").withShortName("d").create();

    Option dfsOpt = obuilder.withLongName("dfs").withRequired(false).withArgument(
        abuilder.withName("dfs").withMinimum(1).withMaximum(1).create()).withDescription(
        "HDFS namenode URI").withShortName("dfs").create();

    Option numTopicsOpt = obuilder.withLongName("numTopics").withRequired(true).withArgument(abuilder
        .withName("numTopics").withMinimum(1).withMaximum(1)
        .create()).withDescription("Number of topics to learn").withShortName("top").create();

    Option outputTopicFileOpt = obuilder.withLongName("topicOutputFile").withRequired(true).withArgument(
        abuilder.withName("topicOutputFile").withMinimum(1).withMaximum(1).create())
        .withDescription("File to write out p(term | topic)").withShortName("to").create();

    Option outputDocFileOpt = obuilder.withLongName("docOutputFile").withRequired(true).withArgument(
        abuilder.withName("docOutputFile").withMinimum(1).withMaximum(1).create())
        .withDescription("File to write out p(topic | docid)").withShortName("do").create();

    Option alphaOpt = obuilder.withLongName("alpha").withRequired(false).withArgument(abuilder
        .withName("alpha").withMinimum(1).withMaximum(1).withDefault("0.1").create())
        .withDescription("Smoothing parameter for p(topic | document) prior").withShortName("a").create();

    Option etaOpt = obuilder.withLongName("eta").withRequired(false).withArgument(abuilder
        .withName("eta").withMinimum(1).withMaximum(1).withDefault("0.1").create())
        .withDescription("Smoothing parameter for p(term | topic)").withShortName("e").create();

    Option maxIterOpt = obuilder.withLongName("maxIterations").withRequired(false).withArgument(abuilder
        .withName("maxIterations").withMinimum(1).withMaximum(1).withDefault("10").create())
        .withDescription("Maximum number of training passes").withShortName("m").create();

    Option modelCorpusFractionOption = obuilder.withLongName("modelCorpusFraction")
        .withRequired(false).withArgument(abuilder.withName("modelCorpusFraction").withMinimum(1)
        .withMaximum(1).withDefault("0.0").create()).withShortName("mcf")
        .withDescription("For online updates, initial value of |model|/|corpus|").create();

    Option burnInOpt = obuilder.withLongName("burnInIterations").withRequired(false).withArgument(abuilder
        .withName("burnInIterations").withMinimum(1).withMaximum(1).withDefault("5").create())
        .withDescription("Minimum number of iterations").withShortName("b").create();

    Option convergenceOpt = obuilder.withLongName("convergence").withRequired(false).withArgument(abuilder
        .withName("convergence").withMinimum(1).withMaximum(1).withDefault("0.0").create())
        .withDescription("Fractional rate of perplexity to consider convergence").withShortName("c").create();

    Option reInferDocTopicsOpt = obuilder.withLongName("reInferDocTopics").withRequired(false)
        .withArgument(abuilder.withName("reInferDocTopics").withMinimum(1).withMaximum(1)
        .withDefault("no").create())
        .withDescription("re-infer p(topic | doc) : [no | randstart | continue]")
        .withShortName("rdt").create();

    Option numTrainThreadsOpt = obuilder.withLongName("numTrainThreads").withRequired(false)
        .withArgument(abuilder.withName("numTrainThreads").withMinimum(1).withMaximum(1)
        .withDefault("1").create())
        .withDescription("number of threads to train with")
        .withShortName("ntt").create();

    Option numUpdateThreadsOpt = obuilder.withLongName("numUpdateThreads").withRequired(false)
        .withArgument(abuilder.withName("numUpdateThreads").withMinimum(1).withMaximum(1)
        .withDefault("1").create())
        .withDescription("number of threads to update the model with")
        .withShortName("nut").create();

    Option verboseOpt = obuilder.withLongName("verbose").withRequired(false)
        .withArgument(abuilder.withName("verbose").withMinimum(1).withMaximum(1)
        .withDefault("false").create())
        .withDescription("print verbose information, like top-terms in each topic, during iteration")
        .withShortName("v").create();

    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(numTopicsOpt)
        .withOption(alphaOpt).withOption(etaOpt)
        .withOption(maxIterOpt).withOption(burnInOpt).withOption(convergenceOpt)
        .withOption(dictOpt).withOption(reInferDocTopicsOpt)
        .withOption(outputDocFileOpt).withOption(outputTopicFileOpt).withOption(dfsOpt)
        .withOption(numTrainThreadsOpt).withOption(numUpdateThreadsOpt)
        .withOption(modelCorpusFractionOption).withOption(verboseOpt).create();

    try {
      Parser parser = new Parser();

      parser.setGroup(group);
      parser.setHelpOption(helpOpt);
      CommandLine cmdLine = parser.parse(args);
      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return -1;
      }

      String inputDirString = (String) cmdLine.getValue(inputDirOpt);
      String dictDirString = cmdLine.hasOption(dictOpt) ? (String)cmdLine.getValue(dictOpt) : null;
      int numTopics = Integer.parseInt((String) cmdLine.getValue(numTopicsOpt));
      double alpha = Double.parseDouble((String)cmdLine.getValue(alphaOpt));
      double eta = Double.parseDouble((String)cmdLine.getValue(etaOpt));
      int maxIterations = Integer.parseInt((String)cmdLine.getValue(maxIterOpt));
      int burnInIterations = Integer.parseInt((String)cmdLine.getValue(burnInOpt));
      double minFractionalErrorChange = Double.parseDouble((String) cmdLine.getValue(convergenceOpt));
      int numTrainThreads = Integer.parseInt((String)cmdLine.getValue(numTrainThreadsOpt));
      int numUpdateThreads = Integer.parseInt((String)cmdLine.getValue(numUpdateThreadsOpt));
      String topicOutFile = (String)cmdLine.getValue(outputTopicFileOpt);
      String docOutFile = (String)cmdLine.getValue(outputDocFileOpt);
      //String reInferDocTopics = (String)cmdLine.getValue(reInferDocTopicsOpt);
      boolean verbose = Boolean.parseBoolean((String) cmdLine.getValue(verboseOpt));
      double modelCorpusFraction = Double.parseDouble((String)cmdLine.getValue(modelCorpusFractionOption));

      long start = System.nanoTime();

      if (conf.get("fs.default.name") == null) {
        String dfsNameNode = (String)cmdLine.getValue(dfsOpt);
        conf.set("fs.default.name", dfsNameNode);
      }
      String[] terms = loadDictionary(dictDirString, conf);
      logTime("dictionary loading", System.nanoTime() - start);
      start = System.nanoTime();
      Matrix corpus = loadVectors(inputDirString, conf);
      logTime("vector seqfile corpus loading", System.nanoTime() - start);
      start = System.nanoTime();
      InMemoryCollapsedVariationalBayes0 cvb0 =
          new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta,
              numTrainThreads, numUpdateThreads, modelCorpusFraction);
      logTime("cvb0 init", System.nanoTime() - start);

      start = System.nanoTime();
      cvb0.setVerbose(verbose);
      cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations);
      logTime("total training time", System.nanoTime() - start);

      /*
      if ("randstart".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, true);
      } else if ("continue".equalsIgnoreCase(reInferDocTopics)) {
        cvb0.inferDocuments(0.0, 100, false);
      }
      */

      start = System.nanoTime();
      cvb0.writeModel(new Path(topicOutFile));
      DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts);
      logTime("printTopics", System.nanoTime() - start);
    } catch (OptionException e) {
      log.error("Error while parsing options", e);
      CommandLineUtil.printHelp(group);
    }
    return 0;
  }

  /**
   * Reads a (term, id) sequence file into a dense String[] indexed by term id.
   * Returns null when no dictionary path was supplied.
   */
  private static String[] loadDictionary(String dictionaryPath, Configuration conf) {
    if (dictionaryPath == null) {
      return null;
    }
    Path dictionaryFile = new Path(dictionaryPath);
    List<Pair<Integer, String>> termList = new ArrayList<>();
    int maxTermId = 0;
    // key is word value is id
    for (Pair<Writable, IntWritable> record
        : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
      termList.add(new Pair<>(record.getSecond().get(),
          record.getFirst().toString()));
      maxTermId = Math.max(maxTermId, record.getSecond().get());
    }
    String[] terms = new String[maxTermId + 1];
    for (Pair<Integer, String> pair : termList) {
      terms[pair.getFirst()] = pair.getSecond();
    }
    return terms;
  }

  // NOTE(review): this override just delegates to super and adds nothing -- candidate
  // for removal.
  @Override
  public Configuration getConf() {
    return super.getConf();
  }

  /**
   * Loads all (docId, vector) sequence files under {@code vectorPathString} into a single
   * in-memory {@link SparseRowMatrix}, unwrapping {@link NamedVector}s. Row count is
   * inferred from the maximum docId seen.
   *
   * NOTE(review): numRows starts at Integer.MIN_VALUE, so an empty input yields a
   * negative row count after the final increment -- confirm empty inputs are impossible
   * upstream.
   */
  private static Matrix loadVectors(String vectorPathString, Configuration conf)
    throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = new ArrayList<>();
    if (fs.isFile(vectorPath)) {
      subPaths.add(vectorPath);
    } else {
      for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
        subPaths.add(fileStatus.getPath());
      }
    }
    List<Pair<Integer, Vector>> rowList = new ArrayList<>();
    int numRows = Integer.MIN_VALUE;
    int numCols = -1;
    boolean sequentialAccess = false;
    for (Path subPath : subPaths) {
      for (Pair<IntWritable, VectorWritable> record
          : new SequenceFileIterable<IntWritable, VectorWritable>(subPath, true, conf)) {
        int id = record.getFirst().get();
        Vector vector = record.getSecond().get();
        if (vector instanceof NamedVector) {
          vector = ((NamedVector)vector).getDelegate();
        }
        // Column count and access pattern are taken from the first vector seen.
        if (numCols < 0) {
          numCols = vector.size();
          sequentialAccess = vector.isSequentialAccess();
        }
        rowList.add(Pair.of(id, vector));
        numRows = Math.max(numRows, id);
      }
    }
    numRows++;
    Vector[] rowVectors = new Vector[numRows];
    for (Pair<Integer, Vector> pair : rowList) {
      rowVectors[pair.getFirst()] = pair.getSecond();
    }
    return new SparseRowMatrix(numRows, numCols, rowVectors, true, !sequentialAccess);

  }

  /** {@link AbstractJob} hook; delegates to {@link #main2}. */
  @Override
  public int run(String[] strings) throws Exception {
    return main2(strings, getConf());
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
new file mode 100644
index 0000000..c3f2bc0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/ModelTrainer.java
@@ -0,0 +1,301 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Multithreaded LDA model trainer class, which primarily operates by running a "map/reduce"
+ * operation, all in memory locally (ie not a hadoop job!) : the "map" operation is to take
+ * the "read-only" {@link TopicModel} and use it to iteratively learn the p(topic|term, doc)
+ * distribution for documents (this can be done in parallel across many documents, as the
+ * "read-only" model is, well, read-only. Then the outputs of this are "reduced" onto the
+ * "write" model, and these updates are not parallelizable in the same way: individual
+ * documents can't be added to the same entries in different threads at the same time, but
+ * updates across many topics to the same term from the same document can be done in parallel,
+ * so they are.
+ *
+ * Because computation is done asynchronously, when iteration is done, it's important to call
+ * the stop() method, which blocks until work is complete.
+ *
+ * Setting the read model and the write model to be the same object may not quite work yet,
+ * on account of parallelism badness.
+ */
+public class ModelTrainer {
+
+ private static final Logger log = LoggerFactory.getLogger(ModelTrainer.class);
+
+ private final int numTopics;
+ private final int numTerms;
+ // Model used for read-only inference; swapped with writeModel at the end of stop().
+ private TopicModel readModel;
+ // Model that accumulates updates during an iteration; swapped with readModel in stop().
+ private TopicModel writeModel;
+ private ThreadPoolExecutor threadPool;
+ // Bounded queue feeding the pool; train(Vector,...) blocks on it for back-pressure.
+ private BlockingQueue<Runnable> workQueue;
+ private final int numTrainThreads;
+ // True when the read and write models are the same instance (single-model constructor);
+ // train(VectorIterable,...) then uses synchronous batched training instead of async submission.
+ private final boolean isReadWrite;
+
+ public ModelTrainer(TopicModel initialReadModel, TopicModel initialWriteModel,
+ int numTrainThreads, int numTopics, int numTerms) {
+ this.readModel = initialReadModel;
+ this.writeModel = initialWriteModel;
+ this.numTrainThreads = numTrainThreads;
+ this.numTopics = numTopics;
+ this.numTerms = numTerms;
+ isReadWrite = initialReadModel == initialWriteModel;
+ }
+
+ /**
+ * WARNING: this constructor may not lead to good behavior. What should be verified is that
+ * the model updating process does not conflict with model reading. It might work, but then
+ * again, it might not!
+ * @param model to be used for both reading (inference) and accumulating (learning)
+ * @param numTrainThreads
+ * @param numTopics
+ * @param numTerms
+ */
+ public ModelTrainer(TopicModel model, int numTrainThreads, int numTopics, int numTerms) {
+ this(model, model, numTrainThreads, numTopics, numTerms);
+ }
+
+ public TopicModel getReadModel() {
+ return readModel;
+ }
+
+ /**
+ * Creates the training thread pool (fixed size, bounded queue of numTrainThreads * 10 slots)
+ * and resets the write model. train(VectorIterable, ...) calls this itself before iterating.
+ */
+ public void start() {
+ log.info("Starting training threadpool with {} threads", numTrainThreads);
+ workQueue = new ArrayBlockingQueue<>(numTrainThreads * 10);
+ threadPool = new ThreadPoolExecutor(numTrainThreads, numTrainThreads, 0, TimeUnit.SECONDS,
+ workQueue);
+ threadPool.allowCoreThreadTimeOut(false);
+ threadPool.prestartAllCoreThreads();
+ writeModel.reset();
+ }
+
+ public void train(VectorIterable matrix, VectorIterable docTopicCounts) {
+ train(matrix, docTopicCounts, 1);
+ }
+
+ /** Perplexity over the corpus; testFraction of 0 means "score every document". */
+ public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts) {
+ return calculatePerplexity(matrix, docTopicCounts, 0);
+ }
+
+ public double calculatePerplexity(VectorIterable matrix, VectorIterable docTopicCounts,
+ double testFraction) {
+ Iterator<MatrixSlice> docIterator = matrix.iterator();
+ Iterator<MatrixSlice> docTopicIterator = docTopicCounts.iterator();
+ double perplexity = 0;
+ double matrixNorm = 0;
+ while (docIterator.hasNext() && docTopicIterator.hasNext()) {
+ MatrixSlice docSlice = docIterator.next();
+ MatrixSlice topicSlice = docTopicIterator.next();
+ int docId = docSlice.index();
+ Vector document = docSlice.vector();
+ Vector topicDist = topicSlice.vector();
+ // subsample roughly testFraction of documents by id; 0 disables sampling entirely
+ if (testFraction == 0 || docId % (1 / testFraction) == 0) {
+ trainSync(document, topicDist, false, 10);
+ perplexity += readModel.perplexity(document, topicDist);
+ matrixNorm += document.norm(1);
+ }
+ }
+ return perplexity / matrixNorm;
+ }
+
+ public void train(VectorIterable matrix, VectorIterable docTopicCounts, int numDocTopicIters) {
+ start();
+ Iterator<MatrixSlice> docIterator = matrix.iterator();
+ Iterator<MatrixSlice> docTopicIterator = docTopicCounts.iterator();
+ long startTime = System.nanoTime();
+ int i = 0;
+ double[] times = new double[100];
+ Map<Vector, Vector> batch = new HashMap<>();
+ int numTokensInBatch = 0;
+ long batchStart = System.nanoTime();
+ while (docIterator.hasNext() && docTopicIterator.hasNext()) {
+ i++;
+ Vector document = docIterator.next().vector();
+ Vector topicDist = docTopicIterator.next().vector();
+ if (isReadWrite) {
+ if (batch.size() < numTrainThreads) {
+ batch.put(document, topicDist);
+ if (log.isDebugEnabled()) {
+ numTokensInBatch += document.getNumNondefaultElements();
+ }
+ } else {
+ // NOTE(review): on this branch the current document/topicDist pair is not added to
+ // the batch, and the batch map is not cleared after batchTrain; a trailing partial
+ // batch after the loop is also never trained. Confirm whether this is intended.
+ batchTrain(batch, true, numDocTopicIters);
+ long time = System.nanoTime();
+ log.debug("trained {} docs with {} tokens, start time {}, end time {}",
+ numTrainThreads, numTokensInBatch, batchStart, time);
+ batchStart = time;
+ numTokensInBatch = 0;
+ }
+ } else {
+ long start = System.nanoTime();
+ train(document, topicDist, true, numDocTopicIters);
+ if (log.isDebugEnabled()) {
+ // per-token training cost in ms, kept in a 100-entry ring buffer for median reporting
+ times[i % times.length] =
+ (System.nanoTime() - start) / (1.0e6 * document.getNumNondefaultElements());
+ if (i % 100 == 0) {
+ long time = System.nanoTime() - startTime;
+ log.debug("trained {} documents in {}ms", i, time / 1.0e6);
+ if (i % 500 == 0) {
+ Arrays.sort(times);
+ log.debug("training took median {}ms per token-instance", times[times.length / 2]);
+ }
+ }
+ }
+ }
+ }
+ stop();
+ }
+
+ /**
+ * Trains each document of the batch in parallel and waits for completion (invokeAll blocks
+ * until all runnables finish); if update is true, each resulting per-document topic model is
+ * then folded into writeModel. The whole batch is retried if the wait is interrupted.
+ * NOTE(review): interrupt status is not restored after catching InterruptedException.
+ */
+ public void batchTrain(Map<Vector, Vector> batch, boolean update, int numDocTopicsIters) {
+ while (true) {
+ try {
+ List<TrainerRunnable> runnables = new ArrayList<>();
+ for (Map.Entry<Vector, Vector> entry : batch.entrySet()) {
+ runnables.add(new TrainerRunnable(readModel, null, entry.getKey(),
+ entry.getValue(), new SparseRowMatrix(numTopics, numTerms, true),
+ numDocTopicsIters));
+ }
+ threadPool.invokeAll(runnables);
+ if (update) {
+ for (TrainerRunnable runnable : runnables) {
+ writeModel.update(runnable.docTopicModel);
+ }
+ }
+ break;
+ } catch (InterruptedException e) {
+ log.warn("Interrupted during batch training, retrying!", e);
+ }
+ }
+ }
+
+ /**
+ * Asynchronously submits one document for training. Blocks while the bounded work queue is
+ * full (back-pressure) and retries if interrupted while waiting.
+ */
+ public void train(Vector document, Vector docTopicCounts, boolean update, int numDocTopicIters) {
+ while (true) {
+ try {
+ workQueue.put(new TrainerRunnable(readModel, update
+ ? writeModel
+ : null, document, docTopicCounts, new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters));
+ return;
+ } catch (InterruptedException e) {
+ log.warn("Interrupted waiting to submit document to work queue: {}", document, e);
+ }
+ }
+ }
+
+ /** Trains one document synchronously on the calling thread (no thread pool involved). */
+ public void trainSync(Vector document, Vector docTopicCounts, boolean update,
+ int numDocTopicIters) {
+ new TrainerRunnable(readModel, update
+ ? writeModel
+ : null, document, docTopicCounts, new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters).run();
+ }
+
+ /** Runs inference on one document (no model update) and returns its perplexity. */
+ public double calculatePerplexity(Vector document, Vector docTopicCounts, int numDocTopicIters) {
+ TrainerRunnable runner = new TrainerRunnable(readModel, null, document, docTopicCounts,
+ new SparseRowMatrix(numTopics, numTerms, true), numDocTopicIters);
+ return runner.call();
+ }
+
+ /**
+ * Shuts down the pool, waits (up to 60s) for queued work, stops both models, and finally
+ * swaps read and write models so the freshly-accumulated model is read in the next iteration.
+ */
+ public void stop() {
+ long startTime = System.nanoTime();
+ log.info("Initiating stopping of training threadpool");
+ try {
+ threadPool.shutdown();
+ if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
+ log.warn("Threadpool timed out on await termination - jobs still running!");
+ }
+ long newTime = System.nanoTime();
+ log.info("threadpool took: {}ms", (newTime - startTime) / 1.0e6);
+ startTime = newTime;
+ readModel.stop();
+ newTime = System.nanoTime();
+ log.info("readModel.stop() took {}ms", (newTime - startTime) / 1.0e6);
+ startTime = newTime;
+ writeModel.stop();
+ newTime = System.nanoTime();
+ log.info("writeModel.stop() took {}ms", (newTime - startTime) / 1.0e6);
+ // swap roles for the next training iteration
+ TopicModel tmpModel = writeModel;
+ writeModel = readModel;
+ readModel = tmpModel;
+ } catch (InterruptedException e) {
+ log.error("Interrupted shutting down!", e);
+ }
+ }
+
+ public void persist(Path outputPath) throws IOException {
+ readModel.persist(outputPath, true);
+ }
+
+ /**
+ * Unit of work: iterates doc-topic inference on one document against the read model, then
+ * (when writeModel is non-null) pushes the resulting doc-topic model into the write model.
+ * As a Callable it additionally returns the document's perplexity.
+ */
+ private static final class TrainerRunnable implements Runnable, Callable<Double> {
+ private final TopicModel readModel;
+ private final TopicModel writeModel;
+ private final Vector document;
+ private final Vector docTopics;
+ private final Matrix docTopicModel;
+ private final int numDocTopicIters;
+
+ private TrainerRunnable(TopicModel readModel, TopicModel writeModel, Vector document,
+ Vector docTopics, Matrix docTopicModel, int numDocTopicIters) {
+ this.readModel = readModel;
+ this.writeModel = writeModel;
+ this.document = document;
+ this.docTopics = docTopics;
+ this.docTopicModel = docTopicModel;
+ this.numDocTopicIters = numDocTopicIters;
+ }
+
+ @Override
+ public void run() {
+ for (int i = 0; i < numDocTopicIters; i++) {
+ // synchronous read-only call:
+ readModel.trainDocTopicModel(document, docTopics, docTopicModel);
+ }
+ if (writeModel != null) {
+ // parallel call which is read-only on the docTopicModel, and write-only on the writeModel
+ // this method does not return until all rows of the docTopicModel have been submitted
+ // to write work queues
+ writeModel.update(docTopicModel);
+ }
+ }
+
+ @Override
+ public Double call() {
+ run();
+ return readModel.perplexity(document, docTopics);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
new file mode 100644
index 0000000..9ba77c1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/TopicModel.java
@@ -0,0 +1,513 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.DistributedRowMatrixWriter;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.stats.Sampler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Thin wrapper around a {@link Matrix} of counts of occurrences of (topic, term) pairs. Dividing
+ * {code topicTermCount.viewRow(topic).get(term)} by the sum over the values for all terms in that
+ * row yields p(term | topic). Instead dividing it by all topic columns for that term yields
+ * p(topic | term).
+ *
+ * Multithreading is enabled for the {@code update(Matrix)} method: this method is async, and
+ * merely submits the matrix to a work queue. When all work has been submitted,
+ * {@code awaitTermination()} should be called, which will block until updates have been
+ * accumulated.
+ */
+public class TopicModel implements Configurable, Iterable<MatrixSlice> {
+
+ private static final Logger log = LoggerFactory.getLogger(TopicModel.class);
+
+ // optional term dictionary for pretty-printing; may be null
+ private final String[] dictionary;
+ // counts of (topic, term) co-occurrences; row = topic, column = term
+ private final Matrix topicTermCounts;
+ // per-topic totals: topicSums.get(x) normalizes row x of topicTermCounts
+ private final Vector topicSums;
+ private final int numTopics;
+ private final int numTerms;
+ // eta: smoothing added to topic-term counts; alpha: smoothing added to doc-topic weights
+ private final double eta;
+ private final double alpha;
+
+ private Configuration conf;
+
+ private final Sampler sampler;
+ private final int numThreads;
+ private ThreadPoolExecutor threadPool;
+ // one long-lived Updater task per pool thread; updates for a topic always go to the same one
+ private Updater[] updaters;
+
+ public int getNumTerms() {
+ return numTerms;
+ }
+
+ public int getNumTopics() {
+ return numTopics;
+ }
+
+ public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary,
+ double modelWeight) {
+ this(numTopics, numTerms, eta, alpha, null, dictionary, 1, modelWeight);
+ }
+
+ public TopicModel(Configuration conf, double eta, double alpha,
+ String[] dictionary, int numThreads, double modelWeight, Path... modelpath) throws IOException {
+ this(loadModel(conf, modelpath), eta, alpha, dictionary, numThreads, modelWeight);
+ }
+
+ public TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary,
+ int numThreads, double modelWeight) {
+ this(new DenseMatrix(numTopics, numTerms), new DenseVector(numTopics), eta, alpha, dictionary,
+ numThreads, modelWeight);
+ }
+
+ public TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random,
+ String[] dictionary, int numThreads, double modelWeight) {
+ this(randomMatrix(numTopics, numTerms, random), eta, alpha, dictionary, numThreads, modelWeight);
+ }
+
+ private TopicModel(Pair<Matrix, Vector> model, double eta, double alpha, String[] dict,
+ int numThreads, double modelWeight) {
+ this(model.getFirst(), model.getSecond(), eta, alpha, dict, numThreads, modelWeight);
+ }
+
+ public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha,
+ String[] dictionary, double modelWeight) {
+ this(topicTermCounts, topicSums, eta, alpha, dictionary, 1, modelWeight);
+ }
+
+ public TopicModel(Matrix topicTermCounts, double eta, double alpha, String[] dictionary,
+ int numThreads, double modelWeight) {
+ this(topicTermCounts, viewRowSums(topicTermCounts),
+ eta, alpha, dictionary, numThreads, modelWeight);
+ }
+
+ /**
+ * Primary constructor: all other constructors funnel here. Scales the counts by modelWeight
+ * (when != 1) and starts the updater thread pool.
+ */
+ public TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha,
+ String[] dictionary, int numThreads, double modelWeight) {
+ this.dictionary = dictionary;
+ this.topicTermCounts = topicTermCounts;
+ this.topicSums = topicSums;
+ this.numTopics = topicSums.size();
+ this.numTerms = topicTermCounts.numCols();
+ this.eta = eta;
+ this.alpha = alpha;
+ this.sampler = new Sampler(RandomUtils.getRandom());
+ this.numThreads = numThreads;
+ if (modelWeight != 1) {
+ topicSums.assign(Functions.mult(modelWeight));
+ for (int x = 0; x < numTopics; x++) {
+ topicTermCounts.viewRow(x).assign(Functions.mult(modelWeight));
+ }
+ }
+ initializeThreadPool();
+ }
+
+ /** L1 norm of each row of m, as a vector indexed by row. */
+ private static Vector viewRowSums(Matrix m) {
+ Vector v = new DenseVector(m.numRows());
+ for (MatrixSlice slice : m) {
+ v.set(slice.index(), slice.vector().norm(1));
+ }
+ return v;
+ }
+
+ /**
+ * (Re)builds the fixed-size update pool and submits one long-running Updater per thread.
+ * Any previous pool is shut down first (waiting up to 100s for it to drain).
+ */
+ private synchronized void initializeThreadPool() {
+ if (threadPool != null) {
+ threadPool.shutdown();
+ try {
+ threadPool.awaitTermination(100, TimeUnit.SECONDS);
+ } catch (InterruptedException e) {
+ log.error("Could not terminate all threads for TopicModel in time.", e);
+ }
+ }
+ threadPool = new ThreadPoolExecutor(numThreads, numThreads, 0, TimeUnit.SECONDS,
+ new ArrayBlockingQueue<Runnable>(numThreads * 10));
+ threadPool.allowCoreThreadTimeOut(false);
+ updaters = new Updater[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ updaters[i] = new Updater();
+ threadPool.submit(updaters[i]);
+ }
+ }
+
+ Matrix topicTermCounts() {
+ return topicTermCounts;
+ }
+
+ @Override
+ public Iterator<MatrixSlice> iterator() {
+ return topicTermCounts.iterateAll();
+ }
+
+ public Vector topicSums() {
+ return topicSums;
+ }
+
+ /**
+ * Builds a (counts, sums) pair. With a non-null random, entries are uniform random and sums
+ * are the row L1 norms; with null, counts stay zero and every topic sum is set to 1.0.
+ */
+ private static Pair<Matrix,Vector> randomMatrix(int numTopics, int numTerms, Random random) {
+ Matrix topicTermCounts = new DenseMatrix(numTopics, numTerms);
+ Vector topicSums = new DenseVector(numTopics);
+ if (random != null) {
+ for (int x = 0; x < numTopics; x++) {
+ for (int term = 0; term < numTerms; term++) {
+ topicTermCounts.viewRow(x).set(term, random.nextDouble());
+ }
+ }
+ }
+ for (int x = 0; x < numTopics; x++) {
+ topicSums.set(x, random == null ? 1.0 : topicTermCounts.viewRow(x).norm(1));
+ }
+ return Pair.of(topicTermCounts, topicSums);
+ }
+
+ /**
+ * Reads sequence files of (topic id, term-count vector) rows. The number of topics is taken
+ * as the maximum row id seen plus one; numTerms comes from the first row's vector size.
+ * @throws IOException if the paths contain no vectors
+ */
+ public static Pair<Matrix, Vector> loadModel(Configuration conf, Path... modelPaths)
+ throws IOException {
+ int numTopics = -1;
+ int numTerms = -1;
+ List<Pair<Integer, Vector>> rows = new ArrayList<>();
+ for (Path modelPath : modelPaths) {
+ for (Pair<IntWritable, VectorWritable> row
+ : new SequenceFileIterable<IntWritable, VectorWritable>(modelPath, true, conf)) {
+ rows.add(Pair.of(row.getFirst().get(), row.getSecond().get()));
+ numTopics = Math.max(numTopics, row.getFirst().get());
+ if (numTerms < 0) {
+ numTerms = row.getSecond().get().size();
+ }
+ }
+ }
+ if (rows.isEmpty()) {
+ throw new IOException(Arrays.toString(modelPaths) + " have no vectors in it");
+ }
+ numTopics++;
+ Matrix model = new DenseMatrix(numTopics, numTerms);
+ Vector topicSums = new DenseVector(numTopics);
+ for (Pair<Integer, Vector> pair : rows) {
+ model.viewRow(pair.getFirst()).assign(pair.getSecond());
+ topicSums.set(pair.getFirst(), pair.getSecond().norm(1));
+ }
+ return Pair.of(model, topicSums);
+ }
+
+ // NOTE: this is purely for debug purposes. It is not performant to "toString()" a real model
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder();
+ for (int x = 0; x < numTopics; x++) {
+ String v = dictionary != null
+ ? vectorToSortedString(topicTermCounts.viewRow(x).normalize(1), dictionary)
+ : topicTermCounts.viewRow(x).asFormatString();
+ buf.append(v).append('\n');
+ }
+ return buf.toString();
+ }
+
+ /** Samples a topic from topicDistribution, then a term from that topic's count row. */
+ public int sampleTerm(Vector topicDistribution) {
+ return sampler.sample(topicTermCounts.viewRow(sampler.sample(topicDistribution)));
+ }
+
+ public int sampleTerm(int topic) {
+ return sampler.sample(topicTermCounts.viewRow(topic));
+ }
+
+ /**
+ * Zeroes all topic-term counts (as sparse rows), sets every topic sum to 1.0, and restarts
+ * the updater pool if it had already terminated.
+ */
+ public synchronized void reset() {
+ for (int x = 0; x < numTopics; x++) {
+ topicTermCounts.assignRow(x, new SequentialAccessSparseVector(numTerms));
+ }
+ topicSums.assign(1.0);
+ if (threadPool.isTerminated()) {
+ initializeThreadPool();
+ }
+ }
+
+ /** Drains all updaters (each finishes its queued work) and shuts down the pool. */
+ public synchronized void stop() {
+ for (Updater updater : updaters) {
+ updater.shutdown();
+ }
+ threadPool.shutdown();
+ try {
+ if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
+ log.warn("Threadpool timed out on await termination - jobs still running!");
+ }
+ } catch (InterruptedException e) {
+ log.error("Interrupted shutting down!", e);
+ }
+ }
+
+ /** Replaces each topic row with its L1-normalized form and resets all topic sums to 1.0. */
+ public void renormalize() {
+ for (int x = 0; x < numTopics; x++) {
+ topicTermCounts.assignRow(x, topicTermCounts.viewRow(x).normalize(1));
+ topicSums.assign(1.0);
+ }
+ }
+
+ /**
+ * One inference step for a single document: fills docTopicModel with normalized
+ * p(topic|term,doc), weights it by the document's term counts, and recomputes the
+ * doc-topic distribution in {@code topics} (normalized to sum to 1).
+ */
+ public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) {
+ // first calculate p(topic|term,document) for all terms in original, and all topics,
+ // using p(term|topic) and p(topic|doc)
+ pTopicGivenTerm(original, topics, docTopicModel);
+ normalizeByTopic(docTopicModel);
+ // now multiply, term-by-term, by the document, to get the weighted distribution of
+ // term-topic pairs from this document.
+ for (Element e : original.nonZeroes()) {
+ for (int x = 0; x < numTopics; x++) {
+ Vector docTopicModelRow = docTopicModel.viewRow(x);
+ docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get());
+ }
+ }
+ // now recalculate \(p(topic|doc)\) by summing contributions from all of pTopicGivenTerm
+ topics.assign(0.0);
+ for (int x = 0; x < numTopics; x++) {
+ topics.set(x, docTopicModel.viewRow(x).norm(1));
+ }
+ // now renormalize so that \(sum_x(p(x|doc))\) = 1
+ topics.assign(Functions.mult(1 / topics.norm(1)));
+ }
+
+ /** Expected term distribution for a document given its doc-topic weights. */
+ public Vector infer(Vector original, Vector docTopics) {
+ Vector pTerm = original.like();
+ for (Element e : original.nonZeroes()) {
+ int term = e.index();
+ // p(a) = sum_x (p(a|x) * p(x|i))
+ double pA = 0;
+ for (int x = 0; x < numTopics; x++) {
+ pA += (topicTermCounts.viewRow(x).get(term) / topicSums.get(x)) * docTopics.get(x);
+ }
+ pTerm.set(term, pA);
+ }
+ return pTerm;
+ }
+
+ /**
+ * Async update: each topic row is handed to a fixed updater (x % updaters.length), so all
+ * updates to a given topic are serialized on the same updater thread.
+ */
+ public void update(Matrix docTopicCounts) {
+ for (int x = 0; x < numTopics; x++) {
+ updaters[x % updaters.length].update(x, docTopicCounts.viewRow(x));
+ }
+ }
+
+ /** Synchronous accumulate of one topic's counts; called by Updater threads. */
+ public void updateTopic(int topic, Vector docTopicCounts) {
+ topicTermCounts.viewRow(topic).assign(docTopicCounts, Functions.PLUS);
+ topicSums.set(topic, topicSums.get(topic) + docTopicCounts.norm(1));
+ }
+
+ /** Adds a per-topic count vector for a single term into the model (column-wise update). */
+ public void update(int termId, Vector topicCounts) {
+ for (int x = 0; x < numTopics; x++) {
+ Vector v = topicTermCounts.viewRow(x);
+ v.set(termId, v.get(termId) + topicCounts.get(x));
+ }
+ topicSums.assign(topicCounts, Functions.PLUS);
+ }
+
+ // NOTE(review): relies on this.conf having been set via setConf(); confirm all callers
+ // configure this instance before persisting.
+ public void persist(Path outputDir, boolean overwrite) throws IOException {
+ FileSystem fs = outputDir.getFileSystem(conf);
+ if (overwrite) {
+ fs.delete(outputDir, true); // CHECK second arg
+ }
+ DistributedRowMatrixWriter.write(outputDir, conf, topicTermCounts);
+ }
+
+ /**
+ * Computes {@code \(p(topic x | term a, document i)\)} distributions given input document {@code i}.
+ * {@code \(pTGT[x][a]\)} is the (un-normalized) {@code \(p(x|a,i)\)}, or if docTopics is {@code null},
+ * {@code \(p(a|x)\)} (also un-normalized).
+ *
+ * @param document doc-term vector encoding {@code \(w(term a|document i)\)}.
+ * @param docTopics {@code docTopics[x]} is the overall weight of topic {@code x} in given
+ * document. If {@code null}, a topic weight of {@code 1.0} is used for all topics.
+ * @param termTopicDist storage for output {@code \(p(x|a,i)\)} distributions.
+ */
+ private void pTopicGivenTerm(Vector document, Vector docTopics, Matrix termTopicDist) {
+ // for each topic x
+ for (int x = 0; x < numTopics; x++) {
+ // get p(topic x | document i), or 1.0 if docTopics is null
+ double topicWeight = docTopics == null ? 1.0 : docTopics.get(x);
+ // get w(term a | topic x)
+ Vector topicTermRow = topicTermCounts.viewRow(x);
+ // get \sum_a w(term a | topic x)
+ double topicSum = topicSums.get(x);
+ // get p(topic x | term a) distribution to update
+ Vector termTopicRow = termTopicDist.viewRow(x);
+
+ // for each term a in document i with non-zero weight
+ for (Element e : document.nonZeroes()) {
+ int termIndex = e.index();
+
+ // calc un-normalized p(topic x | term a, document i)
+ double termTopicLikelihood = (topicTermRow.get(termIndex) + eta) * (topicWeight + alpha)
+ / (topicSum + eta * numTerms);
+ termTopicRow.set(termIndex, termTopicLikelihood);
+ }
+ }
+ }
+
+ /**
+ * \(sum_x sum_a (c_ai * log(p(x|i) * p(a|x)))\)
+ */
+ public double perplexity(Vector document, Vector docTopics) {
+ double perplexity = 0;
+ double norm = docTopics.norm(1) + (docTopics.size() * alpha);
+ for (Element e : document.nonZeroes()) {
+ int term = e.index();
+ double prob = 0;
+ for (int x = 0; x < numTopics; x++) {
+ double d = (docTopics.get(x) + alpha) / norm;
+ double p = d * (topicTermCounts.viewRow(x).get(term) + eta)
+ / (topicSums.get(x) + eta * numTerms);
+ prob += p;
+ }
+ perplexity += e.get() * Math.log(prob);
+ }
+ return -perplexity;
+ }
+
+ // NOTE(review): iterates the non-zeroes of row 0 only, assuming every topic row shares the
+ // same sparsity pattern (true when rows were filled by pTopicGivenTerm over the same
+ // document) — confirm before reusing elsewhere.
+ private void normalizeByTopic(Matrix perTopicSparseDistributions) {
+ // then make sure that each of these is properly normalized by topic: sum_x(p(x|t,d)) = 1
+ for (Element e : perTopicSparseDistributions.viewRow(0).nonZeroes()) {
+ int a = e.index();
+ double sum = 0;
+ for (int x = 0; x < numTopics; x++) {
+ sum += perTopicSparseDistributions.viewRow(x).get(a);
+ }
+ for (int x = 0; x < numTopics; x++) {
+ perTopicSparseDistributions.viewRow(x).set(a,
+ perTopicSparseDistributions.viewRow(x).get(a) / sum);
+ }
+ }
+ }
+
+ /**
+ * Debug string of the top 25 entries of the vector, sorted by descending value, rendered as
+ * "{term:weight,...}" using the dictionary for labels when available.
+ */
+ public static String vectorToSortedString(Vector vector, String[] dictionary) {
+ List<Pair<String,Double>> vectorValues = new ArrayList<>(vector.getNumNondefaultElements());
+ for (Element e : vector.nonZeroes()) {
+ vectorValues.add(Pair.of(dictionary != null ? dictionary[e.index()] : String.valueOf(e.index()),
+ e.get()));
+ }
+ Collections.sort(vectorValues, new Comparator<Pair<String, Double>>() {
+ @Override public int compare(Pair<String, Double> x, Pair<String, Double> y) {
+ return y.getSecond().compareTo(x.getSecond());
+ }
+ });
+ Iterator<Pair<String,Double>> listIt = vectorValues.iterator();
+ StringBuilder bldr = new StringBuilder(2048);
+ bldr.append('{');
+ int i = 0;
+ while (listIt.hasNext() && i < 25) {
+ i++;
+ Pair<String,Double> p = listIt.next();
+ bldr.append(p.getFirst());
+ bldr.append(':');
+ bldr.append(p.getSecond());
+ bldr.append(',');
+ }
+ if (bldr.length() > 1) {
+ bldr.setCharAt(bldr.length() - 1, '}');
+ }
+ return bldr.toString();
+ }
+
+ @Override
+ public void setConf(Configuration configuration) {
+ this.conf = configuration;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Long-running consumer task: drains (topic, vector) pairs from its own bounded queue and
+ * applies them via updateTopic(). One Updater per pool thread; shutdown() drains the queue
+ * before completing.
+ * NOTE(review): the shutdown/shutdownComplete flags are read and written across threads but
+ * are not volatile and update() reads shutdown without a lock — confirm visibility is
+ * acceptable here.
+ */
+ private final class Updater implements Runnable {
+ private final ArrayBlockingQueue<Pair<Integer, Vector>> queue =
+ new ArrayBlockingQueue<>(100);
+ private boolean shutdown = false;
+ private boolean shutdownComplete = false;
+
+ public void shutdown() {
+ try {
+ synchronized (this) {
+ while (!shutdownComplete) {
+ shutdown = true;
+ wait(10000L); // Arbitrarily, wait 10 seconds rather than forever for this
+ }
+ }
+ } catch (InterruptedException e) {
+ log.warn("Interrupted waiting to shutdown() : ", e);
+ }
+ }
+
+ public boolean update(int topic, Vector v) {
+ if (shutdown) { // maybe don't do this?
+ throw new IllegalStateException("In SHUTDOWN state: cannot submit tasks");
+ }
+ while (true) { // keep trying if interrupted
+ try {
+ // start async operation by submitting to the queue
+ queue.put(Pair.of(topic, v));
+ // return once you got access to the queue
+ return true;
+ } catch (InterruptedException e) {
+ log.warn("Interrupted trying to queue update:", e);
+ }
+ }
+ }
+
+ @Override
+ public void run() {
+ while (!shutdown) {
+ try {
+ Pair<Integer, Vector> pair = queue.poll(1, TimeUnit.SECONDS);
+ if (pair != null) {
+ updateTopic(pair.getFirst(), pair.getSecond());
+ }
+ } catch (InterruptedException e) {
+ log.warn("Interrupted waiting to poll for update", e);
+ }
+ }
+ // in shutdown mode, finish remaining tasks!
+ for (Pair<Integer, Vector> pair : queue) {
+ updateTopic(pair.getFirst(), pair.getSecond());
+ }
+ synchronized (this) {
+ shutdownComplete = true;
+ notifyAll();
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
new file mode 100644
index 0000000..9926b91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/package-info.java
@@ -0,0 +1,13 @@
+/**
+ * <p>This package provides several clustering algorithm implementations. Clustering usually groups a set of
+ * objects into groups of similar items. The definition of similarity usually is up to you - for text documents,
+ * cosine-distance/-similarity is recommended. Mahout also features other types of distance measure like
+ * Euclidean distance.</p>
+ *
+ * <p>Input of each clustering algorithm is a set of vectors representing your items. For texts in general these are
+ * <a href="http://en.wikipedia.org/wiki/TFIDF">TFIDF</a> or
+ * <a href="http://en.wikipedia.org/wiki/Bag_of_words">Bag of words</a> representations of the documents.</p>
+ *
+ * <p>Output of each clustering algorithm is either a hard or soft assignment of items to clusters.</p>
+ */
+package org.apache.mahout.clustering;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
new file mode 100644
index 0000000..aa12b9e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputJob.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+
+/**
+ * Driver for the MapReduce job that turns a textual (i, j, value) affinity-graph
+ * representation into a SequenceFile of sparse row vectors keyed by row index,
+ * suitable for wrapping in a {@link DistributedRowMatrix}.
+ */
+public final class AffinityMatrixInputJob {
+
+ // Utility class: no instances.
+ private AffinityMatrixInputJob() {
+ }
+
+ /**
+ * Initializes and executes the job of reading the documents containing
+ * the data of the affinity matrix in (x_i, x_j, value) format.
+ *
+ * @param input path to text files of (i,j,value) triples
+ * @param output destination path; deleted first if it already exists
+ * @param rows number of rows of the affinity matrix (stored in the conf
+ *             under {@link Keys#AFFINITY_DIMENSIONS} for the reducer)
+ * @param cols number of columns (currently unused by the job setup)
+ * @throws IllegalStateException if the MapReduce job does not complete successfully
+ */
+ public static void runJob(Path input, Path output, int rows, int cols)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ Configuration conf = new Configuration();
+ // remove any stale output so the job can run repeatedly
+ HadoopUtil.delete(conf, output);
+
+ conf.setInt(Keys.AFFINITY_DIMENSIONS, rows);
+ Job job = new Job(conf, "AffinityMatrixInputJob: " + input + " -> M/R -> " + output);
+
+ job.setMapOutputKeyClass(IntWritable.class);
+ job.setMapOutputValueClass(DistributedRowMatrix.MatrixEntryWritable.class);
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(AffinityMatrixInputMapper.class);
+ job.setReducerClass(AffinityMatrixInputReducer.class);
+
+ FileInputFormat.addInputPath(job, input);
+ FileOutputFormat.setOutputPath(job, output);
+
+ job.setJarByClass(AffinityMatrixInputJob.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+
+ /**
+ * A transparent wrapper for the above method which handles the tedious tasks
+ * of setting and retrieving system Paths. Hands back a fully-populated
+ * and initialized DistributedRowMatrix.
+ */
+ public static DistributedRowMatrix runJob(Path input, Path output, int dimensions)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ // NOTE(review): (System.nanoTime() & 0xFF) yields only 256 distinct suffixes,
+ // so concurrent invocations could collide on the same path — confirm acceptable.
+ Path seqFiles = new Path(output, "seqfiles-" + (System.nanoTime() & 0xFF));
+ runJob(input, seqFiles, dimensions, dimensions);
+ DistributedRowMatrix a = new DistributedRowMatrix(seqFiles,
+ new Path(seqFiles, "seqtmp-" + (System.nanoTime() & 0xFF)),
+ dimensions, dimensions);
+ a.setConf(new Configuration());
+ return a;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
new file mode 100644
index 0000000..30d2404
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputMapper.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>Handles reading the files representing the affinity matrix. Since the affinity
+ * matrix is representative of a graph, each line in all the files should
+ * take the form:</p>
+ *
+ * {@code i,j,value}
+ *
+ * <p>where {@code i} and {@code j} are the {@code i}th and
+ * {@code j} data points in the entire set, and {@code value}
+ * represents some measurement of their relative absolute magnitudes. This
+ * is, simply, a method for representing a graph textually.
+ */
+public class AffinityMatrixInputMapper
+ extends Mapper<LongWritable, Text, IntWritable, DistributedRowMatrix.MatrixEntryWritable> {
+
+ private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputMapper.class);
+
+ // compiled once; splitting with String.split would recompile per record
+ private static final Pattern COMMA_PATTERN = Pattern.compile(",");
+
+ /**
+ * Parses one "i,j,value" line into a matrix entry keyed by its row.
+ *
+ * @throws IOException if the line does not have exactly three non-empty fields
+ * @throws NumberFormatException if a field is not a valid number
+ */
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+
+ String[] elements = COMMA_PATTERN.split(value.toString());
+ log.debug("(DEBUG - MAP) Key[{}], Value[{}]", key.get(), value);
+
+ // enforce well-formed textual representation of the graph
+ if (elements.length != 3) {
+ throw new IOException("Expected input of length 3, received "
+ + elements.length + ". Please make sure you adhere to "
+ + "the structure of (i,j,value) for representing a graph in text. "
+ + "Input line was: '" + value + "'.");
+ }
+ if (elements[0].isEmpty() || elements[1].isEmpty() || elements[2].isEmpty()) {
+ throw new IOException("Found an element of 0 length. Please be sure you adhere to the structure of "
+ + "(i,j,value) for representing a graph in text.");
+ }
+
+ // parse the line of text into a DistributedRowMatrix entry,
+ // making the row (elements[0]) the key to the Reducer, and
+ // setting the column (elements[1]) in the entry itself.
+ // parseInt/parseDouble avoid the needless boxing of Integer.valueOf/Double.valueOf,
+ // since setCol(int) and setVal(double) take primitives.
+ DistributedRowMatrix.MatrixEntryWritable toAdd = new DistributedRowMatrix.MatrixEntryWritable();
+ IntWritable row = new IntWritable(Integer.parseInt(elements[0]));
+ toAdd.setRow(-1); // already set as the Reducer's key
+ toAdd.setCol(Integer.parseInt(elements[1]));
+ toAdd.setVal(Double.parseDouble(elements[2]));
+ context.write(row, toAdd);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
new file mode 100644
index 0000000..d892969
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/AffinityMatrixInputReducer.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tasked with taking each DistributedRowMatrix entry and collecting them
+ * into vectors corresponding to rows. The input and output keys are the same,
+ * corresponding to the row in the ensuing matrix. The matrix entries are
+ * entered into a vector according to the column to which they belong, and
+ * the vector is then given the key corresponding to its row.
+ */
+public class AffinityMatrixInputReducer
+ extends Reducer<IntWritable, DistributedRowMatrix.MatrixEntryWritable, IntWritable, VectorWritable> {
+
+ private static final Logger log = LoggerFactory.getLogger(AffinityMatrixInputReducer.class);
+
+ /**
+ * Collects all matrix entries for one row into a single sparse vector.
+ * Vector size comes from {@link Keys#AFFINITY_DIMENSIONS} in the job conf
+ * (Integer.MAX_VALUE if unset).
+ */
+ @Override
+ protected void reduce(IntWritable row, Iterable<DistributedRowMatrix.MatrixEntryWritable> values, Context context)
+ throws IOException, InterruptedException {
+ int size = context.getConfiguration().getInt(Keys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE);
+ // random-access vector for cheap scattered writes; 100 is the initial capacity hint
+ RandomAccessSparseVector out = new RandomAccessSparseVector(size, 100);
+
+ for (DistributedRowMatrix.MatrixEntryWritable element : values) {
+ // place each entry at its column index; later duplicates overwrite earlier ones
+ out.setQuick(element.getCol(), element.getVal());
+ if (log.isDebugEnabled()) {
+ log.debug("(DEBUG - REDUCE) Row[{}], Column[{}], Value[{}]",
+ row.get(), element.getCol(), element.getVal());
+ }
+ }
+ // convert to sequential-access form, which is more efficient for downstream iteration
+ SequentialAccessSparseVector output = new SequentialAccessSparseVector(out);
+ context.write(row, new VectorWritable(output));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
new file mode 100644
index 0000000..593cc58
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/IntDoublePairWritable.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class is a Writable implementation of the mahout.common.Pair
+ * generic class. Since the generic types would also themselves have to
+ * implement Writable, it made more sense to create a more specialized
+ * version of the class altogether.
+ *
+ * In essence, this can be treated as a single Vector Element.
+ */
+public class IntDoublePairWritable implements Writable {
+
+ // the pair: an int key (e.g. a vector index) and its double value
+ private int key;
+ private double value;
+
+ /** No-arg constructor required by the Writable deserialization contract. */
+ public IntDoublePairWritable() {
+ }
+
+ public IntDoublePairWritable(int k, double v) {
+ this.key = k;
+ this.value = v;
+ }
+
+ public void setKey(int k) {
+ this.key = k;
+ }
+
+ public void setValue(double v) {
+ this.value = v;
+ }
+
+ /** Reads the key then the value, mirroring the order written by {@link #write}. */
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.key = in.readInt();
+ this.value = in.readDouble();
+ }
+
+ /** Serializes the key then the value. */
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(key);
+ out.writeDouble(value);
+ }
+
+ public int getKey() {
+ return key;
+ }
+
+ public double getValue() {
+ return value;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
new file mode 100644
index 0000000..268a365
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/Keys.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+/** Shared constants for the spectral clustering jobs. Not instantiable. */
+public class Keys {
+
+ /**
+ * Sets the SequenceFile index for the diagonal matrix.
+ */
+ public static final int DIAGONAL_CACHE_INDEX = 1;
+
+ /** Configuration key under which the affinity-matrix dimension is stored. */
+ public static final String AFFINITY_DIMENSIONS = "org.apache.mahout.clustering.spectral.common.affinitydimensions";
+
+ private Keys() {}
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
new file mode 100644
index 0000000..f245f99
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/MatrixDiagonalizeJob.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Given a matrix, this job returns a vector whose i_th element is the
+ * sum of all the elements in the i_th row of the original matrix.
+ */
+public final class MatrixDiagonalizeJob {
+
+ // Utility class: no instances.
+ private MatrixDiagonalizeJob() {
+ }
+
+ /**
+ * Runs the row-sum job over the affinity matrix and loads the resulting
+ * vector back into memory.
+ *
+ * @param affInput SequenceFile path of the affinity matrix rows
+ * @param dimensions number of rows/columns of the (square) affinity matrix
+ * @return vector whose i-th element is the sum of row i
+ * @throws IllegalStateException if the MapReduce job fails
+ */
+ public static Vector runJob(Path affInput, int dimensions)
+ throws IOException, ClassNotFoundException, InterruptedException {
+
+ // set up all the job tasks
+ Configuration conf = new Configuration();
+ // output goes next to the input, under "diagonal"; cleared if it exists
+ Path diagOutput = new Path(affInput.getParent(), "diagonal");
+ HadoopUtil.delete(conf, diagOutput);
+ conf.setInt(Keys.AFFINITY_DIMENSIONS, dimensions);
+ Job job = new Job(conf, "MatrixDiagonalizeJob");
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setMapOutputKeyClass(NullWritable.class);
+ job.setMapOutputValueClass(IntDoublePairWritable.class);
+ job.setOutputKeyClass(NullWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(MatrixDiagonalizeMapper.class);
+ job.setReducerClass(MatrixDiagonalizeReducer.class);
+
+ FileInputFormat.addInputPath(job, affInput);
+ FileOutputFormat.setOutputPath(job, diagOutput);
+
+ job.setJarByClass(MatrixDiagonalizeJob.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+
+ // read the results back from the path
+ // NOTE(review): assumes a single reducer so all output lands in part-r-00000 — confirm
+ return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
+ }
+
+ /** Emits (row index, row sum) for each matrix row, all under a single null key. */
+ public static class MatrixDiagonalizeMapper
+ extends Mapper<IntWritable, VectorWritable, NullWritable, IntDoublePairWritable> {
+
+ @Override
+ protected void map(IntWritable key, VectorWritable row, Context context)
+ throws IOException, InterruptedException {
+ // store the sum
+ IntDoublePairWritable store = new IntDoublePairWritable(key.get(), row.get().zSum());
+ context.write(NullWritable.get(), store);
+ }
+ }
+
+ /** Gathers all (row, sum) pairs into one dense vector indexed by row. */
+ public static class MatrixDiagonalizeReducer
+ extends Reducer<NullWritable, IntDoublePairWritable, NullWritable, VectorWritable> {
+
+ @Override
+ protected void reduce(NullWritable key, Iterable<IntDoublePairWritable> values,
+ Context context) throws IOException, InterruptedException {
+ // create the return vector
+ Vector retval = new DenseVector(context.getConfiguration().getInt(Keys.AFFINITY_DIMENSIONS, Integer.MAX_VALUE));
+ // put everything in its correct spot
+ for (IntDoublePairWritable e : values) {
+ retval.setQuick(e.getKey(), e.getValue());
+ }
+ // write it out
+ context.write(key, new VectorWritable(retval));
+ }
+ }
+}
r***@apache.org
2018-06-28 14:54:39 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
new file mode 100644
index 0000000..a1cd3e0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmTrainer.java
@@ -0,0 +1,488 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Class containing several algorithms used to train a Hidden Markov Model. The
+ * three main algorithms are: supervised learning, unsupervised Viterbi and
+ * unsupervised Baum-Welch.
+ */
+public final class HmmTrainer {
+
+ /**
+ * No public constructor for utility classes.
+ */
+ private HmmTrainer() {
+ // nothing to do here really.
+ }
+
+ /**
+ * Create an supervised initial estimate of an HMM Model based on a sequence
+ * of observed and hidden states.
+ *
+ * @param nrOfHiddenStates The total number of hidden states
+ * @param nrOfOutputStates The total number of output states
+ * @param observedSequence Integer array containing the observed sequence
+ * @param hiddenSequence Integer array containing the hidden sequence
+ * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero
+ * probabilities.
+ * @return An initial model using the estimated parameters
+ */
+ public static HmmModel trainSupervised(int nrOfHiddenStates, int nrOfOutputStates, int[] observedSequence,
+ int[] hiddenSequence, double pseudoCount) {
+ // make sure the pseudo count is not zero
+ // (intentional parameter reassignment; only an exact 0 is replaced)
+ pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+ // initialize the parameters
+ DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
+ DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
+ // assign a small initial probability that is larger than zero, so
+ // unseen states will not get a zero probability
+ transitionMatrix.assign(pseudoCount);
+ emissionMatrix.assign(pseudoCount);
+ // given no prior knowledge, we have to assume that all initial hidden
+ // states are equally likely (uniform distribution)
+ DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates);
+ initialProbabilities.assign(1.0 / nrOfHiddenStates);
+
+ // now loop over the sequences to count the number of transitions
+ countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+ hiddenSequence);
+
+ // make sure that probabilities are normalized: each row of both
+ // matrices is divided by its own sum so rows sum to 1
+ for (int i = 0; i < nrOfHiddenStates; i++) {
+ // compute sum of probabilities for current row of transition matrix
+ double sum = 0;
+ for (int j = 0; j < nrOfHiddenStates; j++) {
+ sum += transitionMatrix.getQuick(i, j);
+ }
+ // normalize current row of transition matrix
+ for (int j = 0; j < nrOfHiddenStates; j++) {
+ transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum);
+ }
+ // compute sum of probabilities for current row of emission matrix
+ sum = 0;
+ for (int j = 0; j < nrOfOutputStates; j++) {
+ sum += emissionMatrix.getQuick(i, j);
+ }
+ // normalize current row of emission matrix
+ for (int j = 0; j < nrOfOutputStates; j++) {
+ emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum);
+ }
+ }
+
+ // return a new model using the parameter estimations
+ return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
+ }
+
+ /**
+ * Function that counts the number of state->state and state->output
+ * transitions for the given observed/hidden sequence.
+ *
+ * @param transitionMatrix transition matrix to use.
+ * @param emissionMatrix emission matrix to use for counting.
+ * @param observedSequence observation sequence to use.
+ * @param hiddenSequence sequence of hidden states to use.
+ */
+ private static void countTransitions(Matrix transitionMatrix,
+ Matrix emissionMatrix, int[] observedSequence, int[] hiddenSequence) {
+ // the first position has no preceding hidden state, so it only
+ // contributes an emission count, not a transition count
+ emissionMatrix.setQuick(hiddenSequence[0], observedSequence[0],
+ emissionMatrix.getQuick(hiddenSequence[0], observedSequence[0]) + 1);
+ for (int i = 1; i < observedSequence.length; ++i) {
+ // increment count of hidden transition (i-1 -> i)
+ transitionMatrix
+ .setQuick(hiddenSequence[i - 1], hiddenSequence[i], transitionMatrix
+ .getQuick(hiddenSequence[i - 1], hiddenSequence[i]) + 1);
+ // increment count of emission at position i
+ emissionMatrix.setQuick(hiddenSequence[i], observedSequence[i],
+ emissionMatrix.getQuick(hiddenSequence[i], observedSequence[i]) + 1);
+ }
+ }
+
+ /**
+ * Create an supervised initial estimate of an HMM Model based on a number of
+ * sequences of observed and hidden states.
+ *
+ * @param nrOfHiddenStates The total number of hidden states
+ * @param nrOfOutputStates The total number of output states
+ * @param hiddenSequences Collection of hidden sequences to use for training
+ * @param observedSequences Collection of observed sequences to use for training associated with hidden sequences.
+ * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero
+ * probabilities.
+ * @return An initial model using the estimated parameters
+ */
+ public static HmmModel trainSupervisedSequence(int nrOfHiddenStates,
+ int nrOfOutputStates, Collection<int[]> hiddenSequences,
+ Collection<int[]> observedSequences, double pseudoCount) {
+
+ // make sure the pseudo count is not zero
+ pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+ // initialize parameters
+ DenseMatrix transitionMatrix = new DenseMatrix(nrOfHiddenStates,
+ nrOfHiddenStates);
+ DenseMatrix emissionMatrix = new DenseMatrix(nrOfHiddenStates,
+ nrOfOutputStates);
+ DenseVector initialProbabilities = new DenseVector(nrOfHiddenStates);
+
+ // assign pseudo count to avoid zero probabilities
+ transitionMatrix.assign(pseudoCount);
+ emissionMatrix.assign(pseudoCount);
+ initialProbabilities.assign(pseudoCount);
+
+ // now loop over the sequences to count the number of transitions;
+ // the two collections are iterated in lockstep, so they are assumed
+ // to pair up element-by-element (extra elements of the longer one are ignored)
+ Iterator<int[]> hiddenSequenceIt = hiddenSequences.iterator();
+ Iterator<int[]> observedSequenceIt = observedSequences.iterator();
+ while (hiddenSequenceIt.hasNext() && observedSequenceIt.hasNext()) {
+ // fetch the current set of sequences
+ int[] hiddenSequence = hiddenSequenceIt.next();
+ int[] observedSequence = observedSequenceIt.next();
+ // increase the count for initial probabilities
+ initialProbabilities.setQuick(hiddenSequence[0], initialProbabilities
+ .getQuick(hiddenSequence[0]) + 1);
+ countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+ hiddenSequence);
+ }
+
+ // make sure that probabilities are normalized
+ double isum = 0; // sum of initial probabilities
+ for (int i = 0; i < nrOfHiddenStates; i++) {
+ isum += initialProbabilities.getQuick(i);
+ // compute sum of probabilities for current row of transition matrix
+ double sum = 0;
+ for (int j = 0; j < nrOfHiddenStates; j++) {
+ sum += transitionMatrix.getQuick(i, j);
+ }
+ // normalize current row of transition matrix
+ for (int j = 0; j < nrOfHiddenStates; j++) {
+ transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j) / sum);
+ }
+ // compute sum of probabilities for current row of emission matrix
+ sum = 0;
+ for (int j = 0; j < nrOfOutputStates; j++) {
+ sum += emissionMatrix.getQuick(i, j);
+ }
+ // normalize current row of emission matrix
+ for (int j = 0; j < nrOfOutputStates; j++) {
+ emissionMatrix.setQuick(i, j, emissionMatrix.getQuick(i, j) / sum);
+ }
+ }
+ // normalize the initial probabilities
+ for (int i = 0; i < nrOfHiddenStates; ++i) {
+ initialProbabilities.setQuick(i, initialProbabilities.getQuick(i) / isum);
+ }
+
+ // return a new model using the parameter estimates
+ return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
+ }
+
+ /**
+ * Iteratively train the parameters of the given initial model wrt to the
+ * observed sequence using Viterbi training.
+ *
+ * @param initialModel The initial model that gets iterated
+ * @param observedSequence The sequence of observed states
+ * @param pseudoCount Value that is assigned to non-occurring transitions to avoid zero
+ * probabilities.
+ * @param epsilon Convergence criteria
+ * @param maxIterations The maximum number of training iterations
+ * @param scaled Use Log-scaled implementation, this is computationally more
+ * expensive but offers better numerical stability for large observed
+ * sequences
+ * @return The iterated model
+ */
+ public static HmmModel trainViterbi(HmmModel initialModel,
+ int[] observedSequence, double pseudoCount, double epsilon,
+ int maxIterations, boolean scaled) {
+
+ // make sure the pseudo count is not zero
+ pseudoCount = pseudoCount == 0 ? Double.MIN_VALUE : pseudoCount;
+
+ // allocate space for iteration models
+ // (clones, so the caller's initialModel is never mutated)
+ HmmModel lastIteration = initialModel.clone();
+ HmmModel iteration = initialModel.clone();
+
+ // allocate space for Viterbi path calculation
+ int[] viterbiPath = new int[observedSequence.length];
+ int[][] phi = new int[observedSequence.length - 1][initialModel
+ .getNrOfHiddenStates()];
+ double[][] delta = new double[observedSequence.length][initialModel
+ .getNrOfHiddenStates()];
+
+ // now run the Viterbi training iteration
+ for (int i = 0; i < maxIterations; ++i) {
+ // compute the Viterbi path
+ HmmAlgorithms.viterbiAlgorithm(viterbiPath, delta, phi, lastIteration,
+ observedSequence, scaled);
+ // Viterbi iteration uses the viterbi path to update
+ // the probabilities
+ Matrix emissionMatrix = iteration.getEmissionMatrix();
+ Matrix transitionMatrix = iteration.getTransitionMatrix();
+
+ // first, assign the pseudo count (resets the counts of the
+ // previous iteration, since the matrices are reused in-place)
+ emissionMatrix.assign(pseudoCount);
+ transitionMatrix.assign(pseudoCount);
+
+ // now count the transitions, treating the Viterbi path as if it
+ // were the true hidden sequence
+ countTransitions(transitionMatrix, emissionMatrix, observedSequence,
+ viterbiPath);
+
+ // and normalize the probabilities
+ for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+ double sum = 0;
+ // normalize the rows of the transition matrix
+ for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+ sum += transitionMatrix.getQuick(j, k);
+ }
+ for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
+ transitionMatrix
+ .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum);
+ }
+ // normalize the rows of the emission matrix
+ sum = 0;
+ for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+ sum += emissionMatrix.getQuick(j, k);
+ }
+ for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
+ emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum);
+ }
+ }
+ // check for convergence
+ if (checkConvergence(lastIteration, iteration, epsilon)) {
+ break;
+ }
+ // overwrite the last iterated model by the new iteration
+ lastIteration.assign(iteration);
+ }
+ // we are done :)
+ return iteration;
+ }
+
  /**
   * Iteratively train the parameters of the given initial model wrt the
   * observed sequence using Baum-Welch training.
   *
   * @param initialModel The initial model that gets iterated
   * @param observedSequence The sequence of observed states
   * @param epsilon Convergence criteria
   * @param maxIterations The maximum number of training iterations
   * @param scaled Use log-scaled implementations of forward/backward algorithm. This
   *               is computationally more expensive, but offers better numerical
   *               stability for long output sequences.
   * @return The iterated model
   */
  public static HmmModel trainBaumWelch(HmmModel initialModel,
      int[] observedSequence, double epsilon, int maxIterations, boolean scaled) {
    // allocate space for the iterations; lastIteration holds the previous
    // model so convergence can be checked against it
    HmmModel lastIteration = initialModel.clone();
    HmmModel iteration = initialModel.clone();

    // allocate space for baum-welch factors (forward factors alpha and
    // backward factors beta, one row per observation, one column per state)
    int hiddenCount = initialModel.getNrOfHiddenStates();
    int visibleCount = observedSequence.length;
    Matrix alpha = new DenseMatrix(visibleCount, hiddenCount);
    Matrix beta = new DenseMatrix(visibleCount, hiddenCount);

    // now run the baum Welch training iteration
    for (int it = 0; it < maxIterations; ++it) {
      // fetch emission and transition matrix of current iteration; these are
      // updated in place by the E/M step below
      Vector initialProbabilities = iteration.getInitialProbabilities();
      Matrix emissionMatrix = iteration.getEmissionMatrix();
      Matrix transitionMatrix = iteration.getTransitionMatrix();

      // compute forward and backward factors
      HmmAlgorithms.forwardAlgorithm(alpha, iteration, observedSequence, scaled);
      HmmAlgorithms.backwardAlgorithm(beta, iteration, observedSequence, scaled);

      // E/M step: recompute the (unnormalized) model parameters from the
      // forward/backward factors, either in log space or plain space
      if (scaled) {
        logScaledBaumWelch(observedSequence, iteration, alpha, beta);
      } else {
        unscaledBaumWelch(observedSequence, iteration, alpha, beta);
      }
      // normalize transition/emission probabilities
      // and normalize the probabilities
      double isum = 0;
      for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
        double sum = 0;
        // normalize the rows of the transition matrix
        // NOTE(review): a row summing to 0 would produce NaN entries here --
        // confirm the update step cannot yield an all-zero row
        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
          sum += transitionMatrix.getQuick(j, k);
        }
        for (int k = 0; k < iteration.getNrOfHiddenStates(); ++k) {
          transitionMatrix
              .setQuick(j, k, transitionMatrix.getQuick(j, k) / sum);
        }
        // normalize the rows of the emission matrix
        sum = 0;
        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
          sum += emissionMatrix.getQuick(j, k);
        }
        for (int k = 0; k < iteration.getNrOfOutputStates(); ++k) {
          emissionMatrix.setQuick(j, k, emissionMatrix.getQuick(j, k) / sum);
        }
        // normalization parameter for initial probabilities
        isum += initialProbabilities.getQuick(j);
      }
      // normalize initial probabilities
      for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
        initialProbabilities.setQuick(i, initialProbabilities.getQuick(i)
            / isum);
      }
      // check for convergence
      if (checkConvergence(lastIteration, iteration, epsilon)) {
        break;
      }
      // overwrite the last iterated model by the new iteration
      lastIteration.assign(iteration);
    }
    // we are done :)
    return iteration;
  }
+
+ private static void unscaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) {
+ Vector initialProbabilities = iteration.getInitialProbabilities();
+ Matrix emissionMatrix = iteration.getEmissionMatrix();
+ Matrix transitionMatrix = iteration.getTransitionMatrix();
+ double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, false);
+
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ initialProbabilities.setQuick(i, alpha.getQuick(0, i)
+ * beta.getQuick(0, i));
+ }
+
+ // recompute transition probabilities
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+ double temp = 0;
+ for (int t = 0; t < observedSequence.length - 1; ++t) {
+ temp += alpha.getQuick(t, i)
+ * emissionMatrix.getQuick(j, observedSequence[t + 1])
+ * beta.getQuick(t + 1, j);
+ }
+ transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j)
+ * temp / modelLikelihood);
+ }
+ }
+ // recompute emission probabilities
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) {
+ double temp = 0;
+ for (int t = 0; t < observedSequence.length; ++t) {
+ // delta tensor
+ if (observedSequence[t] == j) {
+ temp += alpha.getQuick(t, i) * beta.getQuick(t, i);
+ }
+ }
+ emissionMatrix.setQuick(i, j, temp / modelLikelihood);
+ }
+ }
+ }
+
+ private static void logScaledBaumWelch(int[] observedSequence, HmmModel iteration, Matrix alpha, Matrix beta) {
+ Vector initialProbabilities = iteration.getInitialProbabilities();
+ Matrix emissionMatrix = iteration.getEmissionMatrix();
+ Matrix transitionMatrix = iteration.getTransitionMatrix();
+ double modelLikelihood = HmmEvaluator.modelLikelihood(alpha, true);
+
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ initialProbabilities.setQuick(i, Math.exp(alpha.getQuick(0, i) + beta.getQuick(0, i)));
+ }
+
+ // recompute transition probabilities
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ for (int j = 0; j < iteration.getNrOfHiddenStates(); ++j) {
+ double sum = Double.NEGATIVE_INFINITY; // log(0)
+ for (int t = 0; t < observedSequence.length - 1; ++t) {
+ double temp = alpha.getQuick(t, i)
+ + Math.log(emissionMatrix.getQuick(j, observedSequence[t + 1]))
+ + beta.getQuick(t + 1, j);
+ if (temp > Double.NEGATIVE_INFINITY) {
+ // handle 0-probabilities
+ sum = temp + Math.log1p(Math.exp(sum - temp));
+ }
+ }
+ transitionMatrix.setQuick(i, j, transitionMatrix.getQuick(i, j)
+ * Math.exp(sum - modelLikelihood));
+ }
+ }
+ // recompute emission probabilities
+ for (int i = 0; i < iteration.getNrOfHiddenStates(); ++i) {
+ for (int j = 0; j < iteration.getNrOfOutputStates(); ++j) {
+ double sum = Double.NEGATIVE_INFINITY; // log(0)
+ for (int t = 0; t < observedSequence.length; ++t) {
+ // delta tensor
+ if (observedSequence[t] == j) {
+ double temp = alpha.getQuick(t, i) + beta.getQuick(t, i);
+ if (temp > Double.NEGATIVE_INFINITY) {
+ // handle 0-probabilities
+ sum = temp + Math.log1p(Math.exp(sum - temp));
+ }
+ }
+ }
+ emissionMatrix.setQuick(i, j, Math.exp(sum - modelLikelihood));
+ }
+ }
+ }
+
+ /**
+ * Check convergence of two HMM models by computing a simple distance between
+ * emission / transition matrices
+ *
+ * @param oldModel Old HMM Model
+ * @param newModel New HMM Model
+ * @param epsilon Convergence Factor
+ * @return true if training converged to a stable state.
+ */
+ private static boolean checkConvergence(HmmModel oldModel, HmmModel newModel,
+ double epsilon) {
+ // check convergence of transitionProbabilities
+ Matrix oldTransitionMatrix = oldModel.getTransitionMatrix();
+ Matrix newTransitionMatrix = newModel.getTransitionMatrix();
+ double diff = 0;
+ for (int i = 0; i < oldModel.getNrOfHiddenStates(); ++i) {
+ for (int j = 0; j < oldModel.getNrOfHiddenStates(); ++j) {
+ double tmp = oldTransitionMatrix.getQuick(i, j)
+ - newTransitionMatrix.getQuick(i, j);
+ diff += tmp * tmp;
+ }
+ }
+ double norm = Math.sqrt(diff);
+ diff = 0;
+ // check convergence of emissionProbabilities
+ Matrix oldEmissionMatrix = oldModel.getEmissionMatrix();
+ Matrix newEmissionMatrix = newModel.getEmissionMatrix();
+ for (int i = 0; i < oldModel.getNrOfHiddenStates(); i++) {
+ for (int j = 0; j < oldModel.getNrOfOutputStates(); j++) {
+
+ double tmp = oldEmissionMatrix.getQuick(i, j)
+ - newEmissionMatrix.getQuick(i, j);
+ diff += tmp * tmp;
+ }
+ }
+ norm += Math.sqrt(diff);
+ // iteration has converged :)
+ return norm < epsilon;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
new file mode 100644
index 0000000..e710816
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmUtils.java
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * A collection of utilities for handling HMMModel objects.
+ */
+public final class HmmUtils {
+
+ /**
+ * No public constructor for utility classes.
+ */
+ private HmmUtils() {
+ // nothing to do here really.
+ }
+
+ /**
+ * Compute the cumulative transition probability matrix for the given HMM
+ * model. Matrix where each row i is the cumulative distribution of the
+ * transition probability distribution for hidden state i.
+ *
+ * @param model The HMM model for which the cumulative transition matrix should be
+ * computed
+ * @return The computed cumulative transition matrix.
+ */
+ public static Matrix getCumulativeTransitionMatrix(HmmModel model) {
+ // fetch the needed parameters from the model
+ int hiddenStates = model.getNrOfHiddenStates();
+ Matrix transitionMatrix = model.getTransitionMatrix();
+ // now compute the cumulative transition matrix
+ Matrix resultMatrix = new DenseMatrix(hiddenStates, hiddenStates);
+ for (int i = 0; i < hiddenStates; ++i) {
+ double sum = 0;
+ for (int j = 0; j < hiddenStates; ++j) {
+ sum += transitionMatrix.get(i, j);
+ resultMatrix.set(i, j, sum);
+ }
+ resultMatrix.set(i, hiddenStates - 1, 1.0);
+ // make sure the last
+ // state has always a
+ // cumulative
+ // probability of
+ // exactly 1.0
+ }
+ return resultMatrix;
+ }
+
+ /**
+ * Compute the cumulative output probability matrix for the given HMM model.
+ * Matrix where each row i is the cumulative distribution of the output
+ * probability distribution for hidden state i.
+ *
+ * @param model The HMM model for which the cumulative output matrix should be
+ * computed
+ * @return The computed cumulative output matrix.
+ */
+ public static Matrix getCumulativeOutputMatrix(HmmModel model) {
+ // fetch the needed parameters from the model
+ int hiddenStates = model.getNrOfHiddenStates();
+ int outputStates = model.getNrOfOutputStates();
+ Matrix outputMatrix = model.getEmissionMatrix();
+ // now compute the cumulative output matrix
+ Matrix resultMatrix = new DenseMatrix(hiddenStates, outputStates);
+ for (int i = 0; i < hiddenStates; ++i) {
+ double sum = 0;
+ for (int j = 0; j < outputStates; ++j) {
+ sum += outputMatrix.get(i, j);
+ resultMatrix.set(i, j, sum);
+ }
+ resultMatrix.set(i, outputStates - 1, 1.0);
+ // make sure the last
+ // output state has
+ // always a cumulative
+ // probability of 1.0
+ }
+ return resultMatrix;
+ }
+
+ /**
+ * Compute the cumulative distribution of the initial hidden state
+ * probabilities for the given HMM model.
+ *
+ * @param model The HMM model for which the cumulative initial state probabilities
+ * should be computed
+ * @return The computed cumulative initial state probability vector.
+ */
+ public static Vector getCumulativeInitialProbabilities(HmmModel model) {
+ // fetch the needed parameters from the model
+ int hiddenStates = model.getNrOfHiddenStates();
+ Vector initialProbabilities = model.getInitialProbabilities();
+ // now compute the cumulative output matrix
+ Vector resultVector = new DenseVector(initialProbabilities.size());
+ double sum = 0;
+ for (int i = 0; i < hiddenStates; ++i) {
+ sum += initialProbabilities.get(i);
+ resultVector.set(i, sum);
+ }
+ resultVector.set(hiddenStates - 1, 1.0); // make sure the last initial
+ // hidden state probability
+ // has always a cumulative
+ // probability of 1.0
+ return resultVector;
+ }
+
+ /**
+ * Validates an HMM model set
+ *
+ * @param model model to sanity check.
+ */
+ public static void validate(HmmModel model) {
+ if (model == null) {
+ return; // empty models are valid
+ }
+
+ /*
+ * The number of hidden states is positive.
+ */
+ Preconditions.checkArgument(model.getNrOfHiddenStates() > 0,
+ "Error: The number of hidden states has to be greater than 0");
+
+ /*
+ * The number of output states is positive.
+ */
+ Preconditions.checkArgument(model.getNrOfOutputStates() > 0,
+ "Error: The number of output states has to be greater than 0!");
+
+ /*
+ * The size of the vector of initial probabilities is equal to the number of
+ * the hidden states. Each initial probability is non-negative. The sum of
+ * initial probabilities is equal to 1.
+ */
+ Preconditions.checkArgument(model.getInitialProbabilities() != null
+ && model.getInitialProbabilities().size() == model.getNrOfHiddenStates(),
+ "Error: The vector of initial probabilities is not initialized!");
+
+ double sum = 0;
+ for (int i = 0; i < model.getInitialProbabilities().size(); i++) {
+ Preconditions.checkArgument(model.getInitialProbabilities().get(i) >= 0,
+ "Error: Initial probability of state %d is negative", i);
+ sum += model.getInitialProbabilities().get(i);
+ }
+ Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+ "Error: Initial probabilities do not add up to 1");
+ /*
+ * The row size of the output matrix is equal to the number of the hidden
+ * states. The column size is equal to the number of output states. Each
+ * probability of the matrix is non-negative. The sum of each row is equal
+ * to 1.
+ */
+ Preconditions.checkNotNull(model.getEmissionMatrix(), "Error: The output state matrix is not initialized!");
+ Preconditions.checkArgument(model.getEmissionMatrix().numRows() == model.getNrOfHiddenStates()
+ && model.getEmissionMatrix().numCols() == model.getNrOfOutputStates(),
+ "Error: The output state matrix is not of the form nrOfHiddenStates x nrOfOutputStates");
+ for (int i = 0; i < model.getEmissionMatrix().numRows(); i++) {
+ sum = 0;
+ for (int j = 0; j < model.getEmissionMatrix().numCols(); j++) {
+ Preconditions.checkArgument(model.getEmissionMatrix().get(i, j) >= 0,
+ "The output state probability from hidden state " + i + " to output state " + j + " is negative");
+ sum += model.getEmissionMatrix().get(i, j);
+ }
+ Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+ "Error: The output state probabilities for hidden state %d don't add up to 1", i);
+ }
+
+ /*
+ * The size of both dimension of the transition matrix is equal to the
+ * number of the hidden states. Each probability of the matrix is
+ * non-negative. The sum of each row in transition matrix is equal to 1.
+ */
+ Preconditions.checkArgument(model.getTransitionMatrix() != null,
+ "Error: The hidden state matrix is not initialized!");
+ Preconditions.checkArgument(model.getTransitionMatrix().numRows() == model.getNrOfHiddenStates()
+ && model.getTransitionMatrix().numCols() == model.getNrOfHiddenStates(),
+ "Error: The output state matrix is not of the form nrOfHiddenStates x nrOfHiddenStates");
+ for (int i = 0; i < model.getTransitionMatrix().numRows(); i++) {
+ sum = 0;
+ for (int j = 0; j < model.getTransitionMatrix().numCols(); j++) {
+ Preconditions.checkArgument(model.getTransitionMatrix().get(i, j) >= 0,
+ "Error: The transition probability from hidden state %d to hidden state %d is negative", i, j);
+ sum += model.getTransitionMatrix().get(i, j);
+ }
+ Preconditions.checkArgument(Math.abs(sum - 1) <= 0.00001,
+ "Error: The transition probabilities for hidden state " + i + " don't add up to 1.");
+ }
+ }
+
+ /**
+ * Encodes a given collection of state names by the corresponding state IDs
+ * registered in a given model.
+ *
+ * @param model Model to provide the encoding for
+ * @param sequence Collection of state names
+ * @param observed If set, the sequence is encoded as a sequence of observed states,
+ * else it is encoded as sequence of hidden states
+ * @param defaultValue The default value in case a state is not known
+ * @return integer array containing the encoded state IDs
+ */
+ public static int[] encodeStateSequence(HmmModel model,
+ Collection<String> sequence, boolean observed, int defaultValue) {
+ int[] encoded = new int[sequence.size()];
+ Iterator<String> seqIter = sequence.iterator();
+ for (int i = 0; i < sequence.size(); ++i) {
+ String nextState = seqIter.next();
+ int nextID;
+ if (observed) {
+ nextID = model.getOutputStateID(nextState);
+ } else {
+ nextID = model.getHiddenStateID(nextState);
+ }
+ // if the ID is -1, use the default value
+ encoded[i] = nextID < 0 ? defaultValue : nextID;
+ }
+ return encoded;
+ }
+
+ /**
+ * Decodes a given collection of state IDs into the corresponding state names
+ * registered in a given model.
+ *
+ * @param model model to use for retrieving state names
+ * @param sequence int array of state IDs
+ * @param observed If set, the sequence is encoded as a sequence of observed states,
+ * else it is encoded as sequence of hidden states
+ * @param defaultValue The default value in case a state is not known
+ * @return list containing the decoded state names
+ */
+ public static List<String> decodeStateSequence(HmmModel model,
+ int[] sequence,
+ boolean observed,
+ String defaultValue) {
+ List<String> decoded = new ArrayList<>(sequence.length);
+ for (int position : sequence) {
+ String nextState;
+ if (observed) {
+ nextState = model.getOutputStateName(position);
+ } else {
+ nextState = model.getHiddenStateName(position);
+ }
+ // if null was returned, use the default value
+ decoded.add(nextState == null ? defaultValue : nextState);
+ }
+ return decoded;
+ }
+
+ /**
+ * Function used to normalize the probabilities of a given HMM model
+ *
+ * @param model model to normalize
+ */
+ public static void normalizeModel(HmmModel model) {
+ Vector ip = model.getInitialProbabilities();
+ Matrix emission = model.getEmissionMatrix();
+ Matrix transition = model.getTransitionMatrix();
+ // check normalization for all probabilities
+ double isum = 0;
+ for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+ isum += ip.getQuick(i);
+ double sum = 0;
+ for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+ sum += transition.getQuick(i, j);
+ }
+ if (sum != 1.0) {
+ for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+ transition.setQuick(i, j, transition.getQuick(i, j) / sum);
+ }
+ }
+ sum = 0;
+ for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+ sum += emission.getQuick(i, j);
+ }
+ if (sum != 1.0) {
+ for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+ emission.setQuick(i, j, emission.getQuick(i, j) / sum);
+ }
+ }
+ }
+ if (isum != 1.0) {
+ for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+ ip.setQuick(i, ip.getQuick(i) / isum);
+ }
+ }
+ }
+
+ /**
+ * Method to reduce the size of an HMMmodel by converting the models
+ * DenseMatrix/DenseVectors to sparse implementations and setting every value
+ * < threshold to 0
+ *
+ * @param model model to truncate
+ * @param threshold minimum value a model entry must have to be retained.
+ * @return Truncated model
+ */
+ public static HmmModel truncateModel(HmmModel model, double threshold) {
+ Vector ip = model.getInitialProbabilities();
+ Matrix em = model.getEmissionMatrix();
+ Matrix tr = model.getTransitionMatrix();
+ // allocate the sparse data structures
+ RandomAccessSparseVector sparseIp = new RandomAccessSparseVector(model
+ .getNrOfHiddenStates());
+ SparseMatrix sparseEm = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfOutputStates());
+ SparseMatrix sparseTr = new SparseMatrix(model.getNrOfHiddenStates(), model.getNrOfHiddenStates());
+ // now transfer the values
+ for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+ double value = ip.getQuick(i);
+ if (value > threshold) {
+ sparseIp.setQuick(i, value);
+ }
+ for (int j = 0; j < model.getNrOfHiddenStates(); ++j) {
+ value = tr.getQuick(i, j);
+ if (value > threshold) {
+ sparseTr.setQuick(i, j, value);
+ }
+ }
+
+ for (int j = 0; j < model.getNrOfOutputStates(); ++j) {
+ value = em.getQuick(i, j);
+ if (value > threshold) {
+ sparseEm.setQuick(i, j, value);
+ }
+ }
+ }
+ // create a new model
+ HmmModel sparseModel = new HmmModel(sparseTr, sparseEm, sparseIp);
+ // normalize the model
+ normalizeModel(sparseModel);
+ // register the names
+ sparseModel.registerHiddenStateNames(model.getHiddenStateNames());
+ sparseModel.registerOutputStateNames(model.getOutputStateNames());
+ // and return
+ return sparseModel;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
new file mode 100644
index 0000000..d0ae9c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/LossyHmmSerializer.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Utils for serializing Writable parts of HmmModel (that means without hidden state names and so on)
+ */
+final class LossyHmmSerializer {
+
+ private LossyHmmSerializer() {
+ }
+
+ static void serialize(HmmModel model, DataOutput output) throws IOException {
+ MatrixWritable matrix = new MatrixWritable(model.getEmissionMatrix());
+ matrix.write(output);
+ matrix.set(model.getTransitionMatrix());
+ matrix.write(output);
+
+ VectorWritable vector = new VectorWritable(model.getInitialProbabilities());
+ vector.write(output);
+ }
+
+ static HmmModel deserialize(DataInput input) throws IOException {
+ MatrixWritable matrix = new MatrixWritable();
+ matrix.readFields(input);
+ Matrix emissionMatrix = matrix.get();
+
+ matrix.readFields(input);
+ Matrix transitionMatrix = matrix.get();
+
+ VectorWritable vector = new VectorWritable();
+ vector.readFields(input);
+ Vector initialProbabilities = vector.get();
+
+ return new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
new file mode 100644
index 0000000..02baef1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/RandomSequenceGenerator.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.CommandLineUtil;
+
+/**
+ * Command-line tool for generating random sequences by given HMM
+ */
+public final class RandomSequenceGenerator {
+
+ private RandomSequenceGenerator() {
+ }
+
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+ Option outputOption = optionBuilder.withLongName("output").
+ withDescription("Output file with sequence of observed states").
+ withShortName("o").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+ withName("path").create()).withRequired(false).create();
+
+ Option modelOption = optionBuilder.withLongName("model").
+ withDescription("Path to serialized HMM model").
+ withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+ withName("path").create()).withRequired(true).create();
+
+ Option lengthOption = optionBuilder.withLongName("length").
+ withDescription("Length of generated sequence").
+ withShortName("l").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+ withName("number").create()).withRequired(true).create();
+
+ Group optionGroup = new GroupBuilder().
+ withOption(outputOption).withOption(modelOption).withOption(lengthOption).
+ withName("Options").create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(optionGroup);
+ CommandLine commandLine = parser.parse(args);
+
+ String output = (String) commandLine.getValue(outputOption);
+
+ String modelPath = (String) commandLine.getValue(modelOption);
+
+ int length = Integer.parseInt((String) commandLine.getValue(lengthOption));
+
+ //reading serialized HMM
+ HmmModel model;
+ try (DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath))){
+ model = LossyHmmSerializer.deserialize(modelStream);
+ }
+
+ //generating observations
+ int[] observations = HmmEvaluator.predict(model, length, System.currentTimeMillis());
+
+ //writing output
+ try (PrintWriter writer =
+ new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true)){
+ for (int observation : observations) {
+ writer.print(observation);
+ writer.print(' ');
+ }
+ }
+ } catch (OptionException e) {
+ CommandLineUtil.printHelp(optionGroup);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
new file mode 100644
index 0000000..317237d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/ViterbiEvaluator.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * Command-line tool for Viterbi evaluating
+ */
+public final class ViterbiEvaluator {
+
+ // Utility class: all behavior is in main(), no instances allowed.
+ private ViterbiEvaluator() {
+ }
+
+ /**
+ * Decodes the most likely hidden-state sequence for an observed sequence
+ * using a serialized HMM model.
+ *
+ * <p>Reads whitespace-separated integer observations from the --input file,
+ * writes the decoded hidden states (space-separated) to the --output file,
+ * and, if --likelihood is given, prints the model likelihood of the
+ * observed sequence to stdout.</p>
+ *
+ * @param args command-line arguments: --input, --output, --model (required), --likelihood (optional)
+ * @throws IOException if reading the model/observations or writing the output fails
+ */
+ public static void main(String[] args) throws IOException {
+ DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+ Option inputOption = DefaultOptionCreator.inputOption().create();
+
+ Option outputOption = DefaultOptionCreator.outputOption().create();
+
+ Option modelOption = optionBuilder.withLongName("model").
+ withDescription("Path to serialized HMM model").
+ withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+ withName("path").create()).withRequired(true).create();
+
+ // flag option: no argument, presence alone enables likelihood output
+ Option likelihoodOption = optionBuilder.withLongName("likelihood").
+ withDescription("Compute likelihood of observed sequence").
+ withShortName("l").withRequired(false).create();
+
+ Group optionGroup = new GroupBuilder().withOption(inputOption).
+ withOption(outputOption).withOption(modelOption).withOption(likelihoodOption).
+ withName("Options").create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(optionGroup);
+ CommandLine commandLine = parser.parse(args);
+
+ String input = (String) commandLine.getValue(inputOption);
+ String output = (String) commandLine.getValue(outputOption);
+
+ String modelPath = (String) commandLine.getValue(modelOption);
+
+ boolean computeLikelihood = commandLine.hasOption(likelihoodOption);
+
+ //reading serialized HMM (removed a stray empty statement ';' that was here)
+ HmmModel model;
+ try (DataInputStream modelStream = new DataInputStream(new FileInputStream(modelPath))) {
+ model = LossyHmmSerializer.deserialize(modelStream);
+ }
+
+ //reading observations; Scanner stops at the first non-integer token
+ List<Integer> observations = new ArrayList<>();
+ try (Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8")) {
+ while (scanner.hasNextInt()) {
+ observations.add(scanner.nextInt());
+ }
+ }
+
+ // unbox into the primitive array the HMM API expects
+ int[] observationsArray = new int[observations.size()];
+ for (int i = 0; i < observations.size(); ++i) {
+ observationsArray[i] = observations.get(i);
+ }
+
+ //decoding
+ int[] hiddenStates = HmmEvaluator.decode(model, observationsArray, true);
+
+ //writing output
+ try (PrintWriter writer =
+ new PrintWriter(new OutputStreamWriter(new FileOutputStream(output), Charsets.UTF_8), true)) {
+ for (int hiddenState : hiddenStates) {
+ writer.print(hiddenState);
+ writer.print(' ');
+ }
+ }
+
+ if (computeLikelihood) {
+ System.out.println("Likelihood: " + HmmEvaluator.modelLikelihood(model, observationsArray, true));
+ }
+ } catch (OptionException e) {
+ // bad/missing arguments: show usage instead of a stack trace
+ CommandLineUtil.printHelp(optionGroup);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
new file mode 100644
index 0000000..0b2c41b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AbstractOnlineLogisticRegression.java
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Generic definition of a 1 of n logistic regression classifier that returns probabilities in
+ * response to a feature vector. This classifier uses 1 of n-1 coding where the 0-th category
+ * is not stored explicitly.
+ * <p/>
+ * Provides the SGD based algorithm for learning a logistic regression, but omits all
+ * annealing of learning rates. Any extension of this abstract class must define the overall
+ * and per-term annealing for themselves.
+ */
+public abstract class AbstractOnlineLogisticRegression extends AbstractVectorClassifier implements OnlineLearner {
+ // coefficients for the classification. This is a dense matrix
+ // that is (numCategories-1) x numFeatures
+ protected Matrix beta;
+
+ // number of categories we are classifying. This should be the number of rows of beta plus one.
+ protected int numCategories;
+
+ // training-step counter; advanced once per training example and once more on close()
+ protected int step;
+
+ // information about how long since coefficient rows were updated. This allows lazy regularization.
+ protected Vector updateSteps;
+
+ // information about how many updates we have had on a location. This allows per-term
+ // annealing a la confidence weighted learning.
+ protected Vector updateCounts;
+
+ // weight of the prior on beta
+ private double lambda = 1.0e-5;
+ // prior used to regularize (pull back) the coefficients
+ protected PriorFunction prior;
+
+ // can we ignore any further regularization when doing classification?
+ private boolean sealed;
+
+ // by default we don't do any fancy training
+ private Gradient gradient = new DefaultGradient();
+
+ /**
+ * Chainable configuration option.
+ *
+ * @param lambda New value of lambda, the weighting factor for the prior distribution.
+ * @return This, so other configurations can be chained.
+ */
+ public AbstractOnlineLogisticRegression lambda(double lambda) {
+ this.lambda = lambda;
+ return this;
+ }
+
+ /**
+ * Computes the inverse link function, by default the logistic link function.
+ *
+ * @param v The output of the linear combination in a GLM. Note that the value
+ * of v is disturbed (overwritten in place).
+ * @return A version of v with the link function applied.
+ */
+ public static Vector link(Vector v) {
+ double max = v.maxValue();
+ if (max >= 40) {
+ // if max > 40, we subtract the large offset first
+ // the size of the max means that 1+sum(exp(v)) = sum(exp(v)) to within round-off
+ v.assign(Functions.minus(max)).assign(Functions.EXP);
+ return v.divide(v.norm(1));
+ } else {
+ v.assign(Functions.EXP);
+ return v.divide(1 + v.norm(1));
+ }
+ }
+
+ /**
+ * Computes the binomial logistic inverse link function.
+ *
+ * @param r The value to transform.
+ * @return The logistic function of r, i.e. 1/(1+exp(-r)). The two branches are
+ * algebraically identical; each avoids overflow of exp() for its sign of r.
+ */
+ public static double link(double r) {
+ if (r < 0.0) {
+ double s = Math.exp(r);
+ return s / (1.0 + s);
+ } else {
+ double s = Math.exp(-r);
+ return 1.0 / (1.0 + s);
+ }
+ }
+
+ /**
+ * Returns the raw linear scores (beta * instance) without applying the link function.
+ */
+ @Override
+ public Vector classifyNoLink(Vector instance) {
+ // apply pending regularization to whichever coefficients matter
+ regularize(instance);
+ return beta.times(instance);
+ }
+
+ // raw (un-linked) score for the binomial case; note: does NOT apply lazy regularization
+ public double classifyScalarNoLink(Vector instance) {
+ return beta.viewRow(0).dot(instance);
+ }
+
+ /**
+ * Returns n-1 probabilities, one for each category but the 0-th. The probability of the 0-th
+ * category is 1 - sum(this result).
+ *
+ * @param instance A vector of features to be classified.
+ * @return A vector of probabilities, one for each of the first n-1 categories.
+ */
+ @Override
+ public Vector classify(Vector instance) {
+ return link(classifyNoLink(instance));
+ }
+
+ /**
+ * Returns a single scalar probability in the case where we have two categories. Using this
+ * method avoids an extra vector allocation as opposed to calling classify() or an extra two
+ * vector allocations relative to classifyFull().
+ *
+ * @param instance The vector of features to be classified.
+ * @return The probability of the first of two categories.
+ * @throws IllegalArgumentException If the classifier doesn't have two categories.
+ */
+ @Override
+ public double classifyScalar(Vector instance) {
+ Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
+
+ // apply pending regularization to whichever coefficients matter
+ regularize(instance);
+
+ // result is a vector with one element so we can just use dot product
+ return link(classifyScalarNoLink(instance));
+ }
+
+ /**
+ * Performs one SGD step on a single training example. Note that trackingKey is not used
+ * by this implementation; groupKey is forwarded to the gradient.
+ *
+ * @param trackingKey unused here (required by the OnlineLearner interface)
+ * @param groupKey optional group identifier passed to the gradient function
+ * @param actual the observed category (0-based)
+ * @param instance the feature vector
+ */
+ @Override
+ public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+ unseal();
+
+ double learningRate = currentLearningRate();
+
+ // push coefficients back to zero based on the prior
+ regularize(instance);
+
+ // update each row of coefficients according to result
+ Vector gradient = this.gradient.apply(groupKey, actual, instance, this);
+ for (int i = 0; i < numCategories - 1; i++) {
+ double gradientBase = gradient.get(i);
+
+ // then we apply the gradientBase to the resulting element.
+ for (Element updateLocation : instance.nonZeroes()) {
+ int j = updateLocation.index();
+
+ double newValue = beta.getQuick(i, j) + gradientBase * learningRate * perTermLearningRate(j) * instance.get(j);
+ beta.setQuick(i, j, newValue);
+ }
+ }
+
+ // remember that these elements got updated
+ for (Element element : instance.nonZeroes()) {
+ int j = element.index();
+ updateSteps.setQuick(j, getStep());
+ updateCounts.incrementQuick(j, 1);
+ }
+ nextStep();
+
+ }
+
+ @Override
+ public void train(long trackingKey, int actual, Vector instance) {
+ train(trackingKey, null, actual, instance);
+ }
+
+ @Override
+ public void train(int actual, Vector instance) {
+ train(0, null, actual, instance);
+ }
+
+ /**
+ * Lazily applies the prior to the coefficients touched by this instance, catching up on
+ * all regularization steps skipped since each coefficient was last updated.
+ * No-op once the model is sealed or before updateSteps is initialized.
+ */
+ public void regularize(Vector instance) {
+ if (updateSteps == null || isSealed()) {
+ return;
+ }
+
+ // anneal learning rate
+ double learningRate = currentLearningRate();
+
+ // here we lazily apply the prior to make up for our neglect
+ for (int i = 0; i < numCategories - 1; i++) {
+ for (Element updateLocation : instance.nonZeroes()) {
+ int j = updateLocation.index();
+ double missingUpdates = getStep() - updateSteps.get(j);
+ if (missingUpdates > 0) {
+ double rate = getLambda() * learningRate * perTermLearningRate(j);
+ double newValue = prior.age(beta.get(i, j), missingUpdates, rate);
+ beta.set(i, j, newValue);
+ updateSteps.set(j, getStep());
+ }
+ }
+ }
+ }
+
+ // these two abstract methods are how extensions can modify the basic learning behavior of this object.
+
+ public abstract double perTermLearningRate(int j);
+
+ public abstract double currentLearningRate();
+
+ public void setPrior(PriorFunction prior) {
+ this.prior = prior;
+ }
+
+ public void setGradient(Gradient gradient) {
+ this.gradient = gradient;
+ }
+
+ public PriorFunction getPrior() {
+ return prior;
+ }
+
+ /**
+ * Returns the coefficient matrix after flushing all pending regularization
+ * (calls close(), which seals the model).
+ */
+ public Matrix getBeta() {
+ close();
+ return beta;
+ }
+
+ public void setBeta(int i, int j, double betaIJ) {
+ beta.set(i, j, betaIJ);
+ }
+
+ @Override
+ public int numCategories() {
+ return numCategories;
+ }
+
+ public int numFeatures() {
+ return beta.numCols();
+ }
+
+ public double getLambda() {
+ return lambda;
+ }
+
+ public int getStep() {
+ return step;
+ }
+
+ protected void nextStep() {
+ step++;
+ }
+
+ public boolean isSealed() {
+ return sealed;
+ }
+
+ // re-enables training/regularization after a close()
+ protected void unseal() {
+ sealed = false;
+ }
+
+ // applies pending regularization to every coefficient by regularizing against an all-ones vector
+ private void regularizeAll() {
+ Vector all = new DenseVector(beta.numCols());
+ all.assign(1);
+ regularize(all);
+ }
+
+ /**
+ * Flushes all pending (lazy) regularization and seals the model so that
+ * classification no longer needs to regularize. Idempotent.
+ */
+ @Override
+ public void close() {
+ if (!sealed) {
+ step++;
+ regularizeAll();
+ sealed = true;
+ }
+ }
+
+ /**
+ * Copies learning state (coefficients, step counter, update bookkeeping) from another model.
+ *
+ * @throws IllegalArgumentException if the category counts differ.
+ */
+ public void copyFrom(AbstractOnlineLogisticRegression other) {
+ // number of categories we are classifying. This should be the number of rows of beta plus one.
+ Preconditions.checkArgument(numCategories == other.numCategories,
+ "Can't copy unless number of target categories is the same");
+
+ beta.assign(other.beta);
+
+ step = other.step;
+
+ updateSteps.assign(other.updateSteps);
+ updateCounts.assign(other.updateCounts);
+ }
+
+ /**
+ * Checks that no coefficient has become NaN or infinite.
+ *
+ * @return true if all coefficients are finite numbers.
+ */
+ public boolean validModel() {
+ // k counts the NaN/infinite entries of beta
+ double k = beta.aggregate(Functions.PLUS, new DoubleFunction() {
+ @Override
+ public double apply(double v) {
+ return Double.isNaN(v) || Double.isInfinite(v) ? 1 : 0;
+ }
+ });
+ return k < 1;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
new file mode 100644
index 0000000..24e5798
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticRegression.java
@@ -0,0 +1,586 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.ep.EvolutionaryProcess;
+import org.apache.mahout.ep.Mapping;
+import org.apache.mahout.ep.Payload;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineAuc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.concurrent.ExecutionException;
+
+/**
+ * This is a meta-learner that maintains a pool of ordinary
+ * {@link org.apache.mahout.classifier.sgd.OnlineLogisticRegression} learners. Each
+ * member of the pool has different learning rates. Whichever of the learners in the pool falls
+ * behind in terms of average log-likelihood will be tossed out and replaced with variants of the
+ * survivors. This will let us automatically derive an annealing schedule that optimizes learning
+ * speed. Since on-line learners tend to be IO bound anyway, it doesn't cost as much as it might
+ * seem that it would to maintain multiple learners in memory. Doing this adaptation on-line as we
+ * learn also decreases the number of learning rate parameters required and replaces the normal
+ * hyper-parameter search.
+ * <p/>
+ * One wrinkle is that the pool of learners that we maintain is actually a pool of
+ * {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} which themselves contain several OnlineLogisticRegression
+ * objects. These pools allow estimation
+ * of performance on the fly even if we make many passes through the data. This does, however,
+ * increase the cost of training since if we are using 5-fold cross-validation, each vector is used
+ * 4 times for training and once for classification. If this becomes a problem, then we should
+ * probably use a 2-way unbalanced train/test split rather than full cross validation. With the
+ * current default settings, we have 100 learners running. This is better than the alternative of
+ * running hundreds of training passes to find good hyper-parameters because we only have to parse
+ * and feature-ize our inputs once. If you already have good hyper-parameters, then you might
+ * prefer to just run one CrossFoldLearner with those settings.
+ * <p/>
+ * The fitness used here is AUC. Another alternative would be to try log-likelihood, but it is much
+ * easier to get bogus values of log-likelihood than with AUC and the results seem to accord pretty
+ * well. It would be nice to allow the fitness function to be pluggable. This use of AUC means that
+ * AdaptiveLogisticRegression is mostly suited for binary target variables. This will be fixed
+ * before long by extending OnlineAuc to handle non-binary cases or by using a different fitness
+ * value in non-binary cases.
+ */
+public class AdaptiveLogisticRegression implements OnlineLearner, Writable {
+ public static final int DEFAULT_THREAD_COUNT = 20;
+ public static final int DEFAULT_POOL_SIZE = 20;
+ // how many of the best learners survive each evolutionary step unchanged
+ private static final int SURVIVORS = 2;
+
+ // number of training examples seen so far
+ private int record;
+ // record count at which the next evolutionary step is triggered
+ private int cutoff = 1000;
+ private int minInterval = 1000;
+ private int maxInterval = 1000;
+ private int currentStep = 1000;
+ // examples are buffered and trained in batches of this size
+ private int bufferSize = 1000;
+
+ private List<TrainingExample> buffer = new ArrayList<>();
+ private EvolutionaryProcess<Wrapper, CrossFoldLearner> ep;
+ private State<Wrapper, CrossFoldLearner> best;
+ private int threadCount = DEFAULT_THREAD_COUNT;
+ private int poolSize = DEFAULT_POOL_SIZE;
+ private State<Wrapper, CrossFoldLearner> seed;
+ private int numFeatures;
+
+ private boolean freezeSurvivors = true;
+
+ private static final Logger log = LoggerFactory.getLogger(AdaptiveLogisticRegression.class);
+
+ public AdaptiveLogisticRegression() {}
+
+ /**
+ * Uses {@link #DEFAULT_THREAD_COUNT} and {@link #DEFAULT_POOL_SIZE}
+ * @param numCategories The number of categories (labels) to train on
+ * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
+ * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
+ *
+ * @see #AdaptiveLogisticRegression(int, int, org.apache.mahout.classifier.sgd.PriorFunction, int, int)
+ */
+ public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
+ this(numCategories, numFeatures, prior, DEFAULT_THREAD_COUNT, DEFAULT_POOL_SIZE);
+ }
+
+ /**
+ *
+ * @param numCategories The number of categories (labels) to train on
+ * @param numFeatures The number of features used in creating the vectors (i.e. the cardinality of the vector)
+ * @param prior The {@link org.apache.mahout.classifier.sgd.PriorFunction} to use
+ * @param threadCount The number of threads to use for training
+ * @param poolSize The number of {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} to use.
+ */
+ public AdaptiveLogisticRegression(int numCategories, int numFeatures, PriorFunction prior, int threadCount,
+ int poolSize) {
+ this.numFeatures = numFeatures;
+ this.threadCount = threadCount;
+ this.poolSize = poolSize;
+ // two evolvable parameters: lambda and learning rate (see Wrapper.setMappings)
+ seed = new State<>(new double[2], 10);
+ Wrapper w = new Wrapper(numCategories, numFeatures, prior);
+ seed.setPayload(w);
+
+ Wrapper.setMappings(seed);
+ // NOTE(review): second setPayload(w) call looks redundant (same payload) — verify before removing
+ seed.setPayload(w);
+ setPoolSize(this.poolSize);
+ }
+
+ @Override
+ public void train(int actual, Vector instance) {
+ train(record, null, actual, instance);
+ }
+
+ @Override
+ public void train(long trackingKey, int actual, Vector instance) {
+ train(trackingKey, null, actual, instance);
+ }
+
+ /**
+ * Buffers the example; once the buffer exceeds bufferSize the whole batch is
+ * trained into every member of the learner pool.
+ */
+ @Override
+ public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+ record++;
+
+ buffer.add(new TrainingExample(trackingKey, groupKey, actual, instance));
+ //don't train until we have enough examples
+ if (buffer.size() > bufferSize) {
+ trainWithBufferedExamples();
+ }
+ }
+
+ // Trains the buffered batch into every learner in the pool (in parallel), records the
+ // fittest learner, and periodically evolves the population.
+ private void trainWithBufferedExamples() {
+ try {
+ this.best = ep.parallelDo(new EvolutionaryProcess.Function<Payload<CrossFoldLearner>>() {
+ @Override
+ public double apply(Payload<CrossFoldLearner> z, double[] params) {
+ Wrapper x = (Wrapper) z;
+ for (TrainingExample example : buffer) {
+ x.train(example);
+ }
+ if (x.getLearner().validModel()) {
+ if (x.getLearner().numCategories() == 2) {
+ // fitness is AUC for binomial targets
+ return x.wrapped.auc();
+ } else {
+ return x.wrapped.logLikelihood();
+ }
+ } else {
+ // NaN fitness drops diverged learners from consideration
+ return Double.NaN;
+ }
+ }
+ });
+ } catch (InterruptedException e) {
+ // restore the interrupt status so callers can still observe the interruption
+ Thread.currentThread().interrupt();
+ log.warn("Ignoring exception", e);
+ } catch (ExecutionException e) {
+ throw new IllegalStateException(e.getCause());
+ }
+ buffer.clear();
+
+ if (record > cutoff) {
+ cutoff = nextStep(record);
+
+ // evolve based on new fitness
+ ep.mutatePopulation(SURVIVORS);
+
+ if (freezeSurvivors) {
+ // now grossly hack the top survivors so they stick around. Set their
+ // mutation rates small and also hack their learning rate to be small
+ // as well.
+ for (State<Wrapper, CrossFoldLearner> state : ep.getPopulation().subList(0, SURVIVORS)) {
+ Wrapper.freeze(state);
+ }
+ }
+ }
+
+ }
+
+ /**
+ * Computes the next evolution cutoff (in records), clamping the step size to
+ * [minInterval, maxInterval] and never shrinking the current step.
+ */
+ public int nextStep(int recordNumber) {
+ int stepSize = stepSize(recordNumber, 2.6);
+ if (stepSize < minInterval) {
+ stepSize = minInterval;
+ }
+
+ if (stepSize > maxInterval) {
+ stepSize = maxInterval;
+ }
+
+ // round up to the next multiple of stepSize past recordNumber
+ int newCutoff = stepSize * (recordNumber / stepSize + 1);
+ if (newCutoff < cutoff + currentStep) {
+ newCutoff = cutoff + currentStep;
+ } else {
+ this.currentStep = stepSize;
+ }
+ return newCutoff;
+ }
+
+ /**
+ * Returns a 1-2-5 style step size that grows roughly geometrically with recordNumber.
+ */
+ public static int stepSize(int recordNumber, double multiplier) {
+ int[] bumps = {1, 2, 5};
+ double log = Math.floor(multiplier * Math.log10(recordNumber));
+ int bump = bumps[(int) log % bumps.length];
+ int scale = (int) Math.pow(10, Math.floor(log / bumps.length));
+
+ return bump * scale;
+ }
+
+ /**
+ * Flushes any buffered examples, closes every learner in the pool, and shuts
+ * down the evolutionary process.
+ */
+ @Override
+ public void close() {
+ trainWithBufferedExamples();
+ try {
+ ep.parallelDo(new EvolutionaryProcess.Function<Payload<CrossFoldLearner>>() {
+ @Override
+ public double apply(Payload<CrossFoldLearner> payload, double[] params) {
+ CrossFoldLearner learner = ((Wrapper) payload).getLearner();
+ learner.close();
+ return learner.logLikelihood();
+ }
+ });
+ } catch (InterruptedException e) {
+ // restore the interrupt status so callers can still observe the interruption
+ Thread.currentThread().interrupt();
+ log.warn("Ignoring exception", e);
+ } catch (ExecutionException e) {
+ throw new IllegalStateException(e);
+ } finally {
+ ep.close();
+ }
+ }
+
+ /**
+ * How often should the evolutionary optimization of learning parameters occur?
+ *
+ * @param interval Number of training examples to use in each epoch of optimization.
+ */
+ public void setInterval(int interval) {
+ setInterval(interval, interval);
+ }
+
+ /**
+ * Starts optimization using the shorter interval and progresses to the longer using the specified
+ * number of steps per decade. Note that values &lt; 200 are not accepted. Values even that small
+ * are unlikely to be useful.
+ *
+ * @param minInterval The minimum epoch length for the evolutionary optimization
+ * @param maxInterval The maximum epoch length
+ */
+ public void setInterval(int minInterval, int maxInterval) {
+ this.minInterval = Math.max(200, minInterval);
+ this.maxInterval = Math.max(200, maxInterval);
+ this.cutoff = minInterval * (record / minInterval + 1);
+ this.currentStep = minInterval;
+ bufferSize = Math.min(minInterval, bufferSize);
+ }
+
+ // NOTE: rebuilds the evolutionary process, discarding any trained population
+ public final void setPoolSize(int poolSize) {
+ this.poolSize = poolSize;
+ setupOptimizer(poolSize);
+ }
+
+ public void setThreadCount(int threadCount) {
+ this.threadCount = threadCount;
+ setupOptimizer(poolSize);
+ }
+
+ public void setAucEvaluator(OnlineAuc auc) {
+ seed.getPayload().setAucEvaluator(auc);
+ setupOptimizer(poolSize);
+ }
+
+ private void setupOptimizer(int poolSize) {
+ ep = new EvolutionaryProcess<>(threadCount, poolSize, seed);
+ }
+
+ /**
+ * Returns the size of the internal feature vector. Note that this is not the same as the number
+ * of distinct features, especially if feature hashing is being used.
+ *
+ * @return The internal feature vector size.
+ */
+ public int numFeatures() {
+ return numFeatures;
+ }
+
+ /**
+ * What is the AUC for the current best member of the population. If no member is best, usually
+ * because we haven't done any training yet, then the result is set to NaN.
+ *
+ * @return The AUC of the best member of the population or NaN if we can't figure that out.
+ */
+ public double auc() {
+ if (best == null) {
+ return Double.NaN;
+ } else {
+ Wrapper payload = best.getPayload();
+ return payload.getLearner().auc();
+ }
+ }
+
+ public State<Wrapper, CrossFoldLearner> getBest() {
+ return best;
+ }
+
+ public void setBest(State<Wrapper, CrossFoldLearner> best) {
+ this.best = best;
+ }
+
+ public int getRecord() {
+ return record;
+ }
+
+ public void setRecord(int record) {
+ this.record = record;
+ }
+
+ public int getMinInterval() {
+ return minInterval;
+ }
+
+ public int getMaxInterval() {
+ return maxInterval;
+ }
+
+ public int getNumCategories() {
+ return seed.getPayload().getLearner().numCategories();
+ }
+
+ public PriorFunction getPrior() {
+ return seed.getPayload().getLearner().getPrior();
+ }
+
+ public void setBuffer(List<TrainingExample> buffer) {
+ this.buffer = buffer;
+ }
+
+ public List<TrainingExample> getBuffer() {
+ return buffer;
+ }
+
+ public EvolutionaryProcess<Wrapper, CrossFoldLearner> getEp() {
+ return ep;
+ }
+
+ public void setEp(EvolutionaryProcess<Wrapper, CrossFoldLearner> ep) {
+ this.ep = ep;
+ }
+
+ public State<Wrapper, CrossFoldLearner> getSeed() {
+ return seed;
+ }
+
+ public void setSeed(State<Wrapper, CrossFoldLearner> seed) {
+ this.seed = seed;
+ }
+
+ public int getNumFeatures() {
+ return numFeatures;
+ }
+
+ public void setAveragingWindow(int averagingWindow) {
+ seed.getPayload().getLearner().setWindowSize(averagingWindow);
+ setupOptimizer(poolSize);
+ }
+
+ public void setFreezeSurvivors(boolean freezeSurvivors) {
+ this.freezeSurvivors = freezeSurvivors;
+ }
+
+ /**
+ * Provides a shim between the EP optimization stuff and the CrossFoldLearner. The most important
+ * interface has to do with the parameters of the optimization. These are taken from the double[]
+ * params in the following order <ul> <li> regularization constant lambda <li> learningRate </ul>.
+ * All other parameters are set in such a way so as to defeat annealing to the extent possible.
+ * This lets the evolutionary algorithm handle the annealing.
+ * <p/>
+ * Note that per coefficient annealing is still done and no optimization of the per coefficient
+ * offset is done.
+ */
+ public static class Wrapper implements Payload<CrossFoldLearner> {
+ private CrossFoldLearner wrapped;
+
+ public Wrapper() {
+ }
+
+ public Wrapper(int numCategories, int numFeatures, PriorFunction prior) {
+ wrapped = new CrossFoldLearner(5, numCategories, numFeatures, prior);
+ }
+
+ @Override
+ public Wrapper copy() {
+ Wrapper r = new Wrapper();
+ r.wrapped = wrapped.copy();
+ return r;
+ }
+
+ /** Applies the evolved parameters (lambda, learningRate) to the wrapped learner. */
+ @Override
+ public void update(double[] params) {
+ int i = 0;
+ wrapped.lambda(params[i++]);
+ wrapped.learningRate(params[i]);
+
+ // defeat the learner's own annealing so evolution controls the rate
+ wrapped.stepOffset(1);
+ wrapped.alpha(1);
+ wrapped.decayExponent(0);
+ }
+
+ public static void freeze(State<Wrapper, CrossFoldLearner> s) {
+ // radically decrease learning rate
+ double[] params = s.getParams();
+ params[1] -= 10;
+
+ // and cause evolution to hold (almost)
+ s.setOmni(s.getOmni() / 20);
+ double[] step = s.getStep();
+ for (int i = 0; i < step.length; i++) {
+ step[i] /= 20;
+ }
+ }
+
+ public static void setMappings(State<Wrapper, CrossFoldLearner> x) {
+ int i = 0;
+ // set the range for regularization (lambda)
+ x.setMap(i++, Mapping.logLimit(1.0e-8, 0.1));
+ // set the range for learning rate (mu)
+ x.setMap(i, Mapping.logLimit(1.0e-8, 1));
+ }
+
+ public void train(TrainingExample example) {
+ wrapped.train(example.getKey(), example.getGroupKey(), example.getActual(), example.getInstance());
+ }
+
+ public CrossFoldLearner getLearner() {
+ return wrapped;
+ }
+
+ @Override
+ public String toString() {
+ return String.format(Locale.ENGLISH, "auc=%.2f", wrapped.auc());
+ }
+
+ public void setAucEvaluator(OnlineAuc auc) {
+ wrapped.setAucEvaluator(auc);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ wrapped.write(out);
+ }
+
+ @Override
+ public void readFields(DataInput input) throws IOException {
+ wrapped = new CrossFoldLearner();
+ wrapped.readFields(input);
+ }
+ }
+
+ /** One buffered training observation; Writable so buffered state can be serialized. */
+ public static class TrainingExample implements Writable {
+ private long key;
+ private String groupKey;
+ private int actual;
+ private Vector instance;
+
+ // no-arg constructor for Writable deserialization only
+ private TrainingExample() {
+ }
+
+ public TrainingExample(long key, String groupKey, int actual, Vector instance) {
+ this.key = key;
+ this.groupKey = groupKey;
+ this.actual = actual;
+ this.instance = instance;
+ }
+
+ public long getKey() {
+ return key;
+ }
+
+ public int getActual() {
+ return actual;
+ }
+
+ public Vector getInstance() {
+ return instance;
+ }
+
+ public String getGroupKey() {
+ return groupKey;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeLong(key);
+ // groupKey is nullable: write a presence flag first
+ if (groupKey != null) {
+ out.writeBoolean(true);
+ out.writeUTF(groupKey);
+ } else {
+ out.writeBoolean(false);
+ }
+ out.writeInt(actual);
+ VectorWritable.writeVector(out, instance, true);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ key = in.readLong();
+ if (in.readBoolean()) {
+ groupKey = in.readUTF();
+ }
+ actual = in.readInt();
+ instance = VectorWritable.readVector(in);
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(record);
+ out.writeInt(cutoff);
+ out.writeInt(minInterval);
+ out.writeInt(maxInterval);
+ out.writeInt(currentStep);
+ out.writeInt(bufferSize);
+
+ out.writeInt(buffer.size());
+ for (TrainingExample example : buffer) {
+ example.write(out);
+ }
+
+ ep.write(out);
+
+ best.write(out);
+
+ out.writeInt(threadCount);
+ out.writeInt(poolSize);
+ seed.write(out);
+ out.writeInt(numFeatures);
+
+ out.writeBoolean(freezeSurvivors);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ // field order must mirror write() exactly
+ record = in.readInt();
+ cutoff = in.readInt();
+ minInterval = in.readInt();
+ maxInterval = in.readInt();
+ currentStep = in.readInt();
+ bufferSize = in.readInt();
+
+ int n = in.readInt();
+ buffer = new ArrayList<>();
+ for (int i = 0; i < n; i++) {
+ TrainingExample example = new TrainingExample();
+ example.readFields(in);
+ buffer.add(example);
+ }
+
+ ep = new EvolutionaryProcess<>();
+ ep.readFields(in);
+
+ best = new State<>();
+ best.readFields(in);
+
+ threadCount = in.readInt();
+ poolSize = in.readInt();
+ seed = new State<>();
+ seed.readFields(in);
+
+ numFeatures = in.readInt();
+ freezeSurvivors = in.readBoolean();
+ }
+}
+
r***@apache.org
2018-06-28 14:54:42 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
new file mode 100644
index 0000000..32d7b5c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Builder.java
@@ -0,0 +1,333 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * Base class for Mapred DecisionForest builders. Takes care of storing the parameters common to the mapred
+ * implementations.<br>
+ * The child classes must implement at least :
+ * <ul>
+ * <li>void configureJob(Job) : to further configure the job before its launch; and</li>
+ * <li>DecisionForest parseOutput(Job, PredictionCallback) : in order to convert the job outputs into a
+ * DecisionForest and its corresponding oob predictions</li>
+ * </ul>
+ *
+ */
+@Deprecated
+public abstract class Builder {
+
+  private static final Logger log = LoggerFactory.getLogger(Builder.class);
+
+  private final TreeBuilder treeBuilder;
+  private final Path dataPath;
+  private final Path datasetPath;
+  private final Long seed;
+  private final Configuration conf;
+  private String outputDirName = "output";
+
+  protected Builder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
+    this.treeBuilder = treeBuilder;
+    this.dataPath = dataPath;
+    this.datasetPath = datasetPath;
+    this.seed = seed;
+    // defensive copy: build() mutates the configuration (seed, nbtrees, treebuilder)
+    this.conf = new Configuration(conf);
+  }
+
+  protected Path getDataPath() {
+    return dataPath;
+  }
+
+  /**
+   * Return the value of "mapred.map.tasks".
+   *
+   * @param conf
+   *          configuration
+   * @return number of map tasks, or -1 if the property is not set
+   */
+  public static int getNumMaps(Configuration conf) {
+    return conf.getInt("mapred.map.tasks", -1);
+  }
+
+  /**
+   * Used only for DEBUG purposes. If false, the mappers don't output anything, so the builder has
+   * nothing to process.
+   *
+   * @param conf
+   *          configuration
+   * @return true if the builder has to return output, false otherwise
+   */
+  protected static boolean isOutput(Configuration conf) {
+    return conf.getBoolean("debug.mahout.rf.output", true);
+  }
+
+  /**
+   * Returns the random seed.
+   *
+   * @param conf
+   *          configuration
+   * @return null if no seed is available
+   */
+  public static Long getRandomSeed(Configuration conf) {
+    String seed = conf.get("mahout.rf.random.seed");
+    if (seed == null) {
+      return null;
+    }
+    return Long.valueOf(seed);
+  }
+
+  /**
+   * Sets the random seed value.
+   *
+   * @param conf
+   *          configuration
+   * @param seed
+   *          random seed
+   */
+  private static void setRandomSeed(Configuration conf, long seed) {
+    conf.setLong("mahout.rf.random.seed", seed);
+  }
+
+  /**
+   * @param conf
+   *          configuration
+   * @return the TreeBuilder stored in the configuration, or null if none was set
+   */
+  public static TreeBuilder getTreeBuilder(Configuration conf) {
+    String string = conf.get("mahout.rf.treebuilder");
+    if (string == null) {
+      return null;
+    }
+    return StringUtils.fromString(string);
+  }
+
+  private static void setTreeBuilder(Configuration conf, TreeBuilder treeBuilder) {
+    conf.set("mahout.rf.treebuilder", StringUtils.toString(treeBuilder));
+  }
+
+  /**
+   * Get the number of trees for the map-reduce job.
+   *
+   * @param conf
+   *          configuration
+   * @return number of trees to build, or -1 if the property is not set
+   */
+  public static int getNbTrees(Configuration conf) {
+    return conf.getInt("mahout.rf.nbtrees", -1);
+  }
+
+  /**
+   * Set the number of trees to grow for the map-reduce job.
+   *
+   * @param conf
+   *          configuration
+   * @param nbTrees
+   *          number of trees to build
+   * @throws IllegalArgumentException
+   *           if (nbTrees <= 0)
+   */
+  public static void setNbTrees(Configuration conf, int nbTrees) {
+    Preconditions.checkArgument(nbTrees > 0, "nbTrees should be greater than 0");
+    conf.setInt("mahout.rf.nbtrees", nbTrees);
+  }
+
+  /**
+   * Sets the output directory name, which will be created in the working directory.
+   *
+   * @param name
+   *          output dir. name
+   */
+  public void setOutputDirName(String name) {
+    outputDirName = name;
+  }
+
+  /**
+   * Output directory path (%WORKING_DIRECTORY%/%OUTPUT_DIR_NAME%).
+   *
+   * @param conf
+   *          configuration
+   * @return output dir. path
+   * @throws IOException
+   *           if we cannot get the default FileSystem
+   */
+  protected Path getOutputPath(Configuration conf) throws IOException {
+    // the output directory is accessed only by this class, so use the default
+    // file system
+    FileSystem fs = FileSystem.get(conf);
+    return new Path(fs.getWorkingDirectory(), outputDirName);
+  }
+
+  /**
+   * Helper method. Get a path from the DistributedCache.
+   *
+   * @param conf
+   *          configuration
+   * @param index
+   *          index of the path in the DistributedCache files
+   * @return path from the DistributedCache
+   * @throws IOException
+   *           if no path is found
+   */
+  public static Path getDistributedCacheFile(Configuration conf, int index) throws IOException {
+    Path[] files = HadoopUtil.getCachedFiles(conf);
+    if (files.length <= index) {
+      throw new IOException("path not found in the DistributedCache");
+    }
+    return files[index];
+  }
+
+  /**
+   * Helper method. Load a Dataset stored in the DistributedCache.
+   *
+   * @param conf
+   *          configuration
+   * @return loaded Dataset
+   * @throws IOException
+   *           if we cannot retrieve the Dataset path from the DistributedCache, or the Dataset could not be
+   *           loaded
+   */
+  public static Dataset loadDataset(Configuration conf) throws IOException {
+    Path datasetPath = getDistributedCacheFile(conf, 0);
+    return Dataset.load(conf, datasetPath);
+  }
+
+  /**
+   * Used by the inheriting classes to configure the job.
+   *
+   * @param job
+   *          Hadoop's Job
+   * @throws IOException
+   *           if anything goes wrong while configuring the job
+   */
+  protected abstract void configureJob(Job job) throws IOException;
+
+  /**
+   * Sequential implementation should override this method to simulate the job execution.
+   *
+   * @param job
+   *          Hadoop's job
+   * @return true if the job succeeded
+   */
+  protected boolean runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
+    return job.waitForCompletion(true);
+  }
+
+  /**
+   * Parse the output files to extract the trees and pass the predictions to the callback.
+   *
+   * @param job
+   *          Hadoop's job
+   * @return built DecisionForest
+   * @throws IOException
+   *           if anything goes wrong while parsing the output
+   */
+  protected abstract DecisionForest parseOutput(Job job) throws IOException;
+
+  /**
+   * Configures and runs the forest-growing job, then parses its output.
+   *
+   * @param nbTrees
+   *          total number of trees to grow
+   * @return built DecisionForest, or null if the job failed or debug output is disabled
+   */
+  public DecisionForest build(int nbTrees)
+      throws IOException, ClassNotFoundException, InterruptedException {
+
+    Path outputPath = getOutputPath(conf);
+    FileSystem fs = outputPath.getFileSystem(conf);
+
+    // check the output
+    if (fs.exists(outputPath)) {
+      throw new IOException("Output path already exists : " + outputPath);
+    }
+
+    if (seed != null) {
+      setRandomSeed(conf, seed);
+    }
+    setNbTrees(conf, nbTrees);
+    setTreeBuilder(conf, treeBuilder);
+
+    // put the dataset into the DistributedCache
+    DistributedCache.addCacheFile(datasetPath.toUri(), conf);
+
+    Job job = new Job(conf, "decision forest builder");
+
+    log.debug("Configuring the job...");
+    configureJob(job);
+
+    log.debug("Running the job...");
+    if (!runJob(job)) {
+      log.error("Job failed!");
+      return null;
+    }
+
+    if (isOutput(conf)) {
+      log.debug("Parsing the output...");
+      DecisionForest forest = parseOutput(job);
+      // the raw job output is no longer needed once the forest is extracted
+      HadoopUtil.delete(conf, outputPath);
+      return forest;
+    }
+
+    return null;
+  }
+
+  /**
+   * Sort the splits into order based on size, so that the biggest go first.<br>
+   * This is the same code used by Hadoop's JobClient.
+   *
+   * @param splits
+   *          input splits
+   */
+  public static void sortSplits(InputSplit[] splits) {
+    Arrays.sort(splits, new Comparator<InputSplit>() {
+      @Override
+      public int compare(InputSplit a, InputSplit b) {
+        try {
+          // descending order: biggest splits first
+          return Long.compare(b.getLength(), a.getLength());
+        } catch (IOException ioe) {
+          throw new IllegalStateException("Problem getting input split size", ioe);
+        } catch (InterruptedException ie) {
+          // restore the interrupt flag before translating to unchecked
+          Thread.currentThread().interrupt();
+          throw new IllegalStateException("Problem getting input split size", ie);
+        }
+      }
+    });
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
new file mode 100644
index 0000000..1a35cfe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/Classifier.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Mapreduce implementation that classifies the input data using a previously built decision forest
+ */
+@Deprecated
+public class Classifier {
+
+  private static final Logger log = LoggerFactory.getLogger(Classifier.class);
+
+  private final Path forestPath;
+  private final Path inputPath;
+  private final Path datasetPath;
+  private final Configuration conf;
+  private final Path outputPath; // path that will contain the final output of the classifier
+  private final Path mappersOutputPath; // mappers will output here
+  private double[][] results;
+
+  /**
+   * @return for each classified instance a {correct label, prediction} pair;
+   *         only populated after {@link #run()} has completed
+   */
+  public double[][] getResults() {
+    return results;
+  }
+
+  public Classifier(Path forestPath,
+                    Path inputPath,
+                    Path datasetPath,
+                    Path outputPath,
+                    Configuration conf) {
+    this.forestPath = forestPath;
+    this.inputPath = inputPath;
+    this.datasetPath = datasetPath;
+    this.outputPath = outputPath;
+    this.conf = conf;
+
+    mappersOutputPath = new Path(outputPath, "mappers");
+  }
+
+  private void configureJob(Job job) throws IOException {
+
+    job.setJarByClass(Classifier.class);
+
+    FileInputFormat.setInputPaths(job, inputPath);
+    FileOutputFormat.setOutputPath(job, mappersOutputPath);
+
+    job.setOutputKeyClass(DoubleWritable.class);
+    job.setOutputValueClass(Text.class);
+
+    job.setMapperClass(CMapper.class);
+    job.setNumReduceTasks(0); // no reducers
+
+    job.setInputFormatClass(CTextInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+  }
+
+  public void run() throws IOException, ClassNotFoundException, InterruptedException {
+    FileSystem fs = FileSystem.get(conf);
+
+    // check the output
+    if (fs.exists(outputPath)) {
+      throw new IOException("Output path already exists : " + outputPath);
+    }
+
+    log.info("Adding the dataset to the DistributedCache");
+    // put the dataset into the DistributedCache
+    DistributedCache.addCacheFile(datasetPath.toUri(), conf);
+
+    log.info("Adding the decision forest to the DistributedCache");
+    DistributedCache.addCacheFile(forestPath.toUri(), conf);
+
+    Job job = new Job(conf, "decision forest classifier");
+
+    log.info("Configuring the job...");
+    configureJob(job);
+
+    log.info("Running the job...");
+    if (!job.waitForCompletion(true)) {
+      throw new IllegalStateException("Job failed!");
+    }
+
+    parseOutput(job);
+
+    // the raw mapper output is no longer needed once it has been parsed
+    HadoopUtil.delete(conf, mappersOutputPath);
+  }
+
+  /**
+   * Extract the prediction for each mapper and write them in the corresponding output file.
+   * The name of the output file is based on the name of the corresponding input file.
+   * Will compute the ConfusionMatrix if necessary.
+   */
+  private void parseOutput(JobContext job) throws IOException {
+    Configuration conf = job.getConfiguration();
+    FileSystem fs = mappersOutputPath.getFileSystem(conf);
+
+    Path[] outfiles = DFUtils.listOutputFiles(fs, mappersOutputPath);
+
+    // read all the output
+    List<double[]> resList = new ArrayList<>();
+    for (Path path : outfiles) {
+      FSDataOutputStream ofile = null;
+      try {
+        for (Pair<DoubleWritable,Text> record : new SequenceFileIterable<DoubleWritable,Text>(path, true, conf)) {
+          double key = record.getFirst().get();
+          String value = record.getSecond().toString();
+          if (ofile == null) {
+            // this is the first value, it contains the name of the input file
+            ofile = fs.create(new Path(outputPath, value).suffix(".out"));
+          } else {
+            // The key contains the correct label of the data. The value contains a prediction
+            ofile.writeChars(value); // write the prediction
+            ofile.writeChar('\n');
+
+            // parseDouble avoids the needless boxing of Double.valueOf()
+            resList.add(new double[]{key, Double.parseDouble(value)});
+          }
+        }
+      } finally {
+        Closeables.close(ofile, false);
+      }
+    }
+    results = new double[resList.size()][2];
+    resList.toArray(results);
+  }
+
+  /**
+   * TextInputFormat that does not split the input files. This ensures that each input file is processed by one single
+   * mapper.
+   */
+  private static class CTextInputFormat extends TextInputFormat {
+    @Override
+    protected boolean isSplitable(JobContext jobContext, Path path) {
+      return false;
+    }
+  }
+
+  public static class CMapper extends Mapper<LongWritable, Text, DoubleWritable, Text> {
+
+    /** used to convert input values to data instances */
+    private DataConverter converter;
+    private DecisionForest forest;
+    private final Random rng = RandomUtils.getRandom();
+    private boolean first = true;
+    private final Text lvalue = new Text();
+    private Dataset dataset;
+    private final DoubleWritable lkey = new DoubleWritable();
+
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+      super.setup(context);
+
+      Configuration conf = context.getConfiguration();
+
+      Path[] files = HadoopUtil.getCachedFiles(conf);
+
+      if (files.length < 2) {
+        throw new IOException("not enough paths in the DistributedCache");
+      }
+      // files[0] is the dataset, files[1] the forest (see run() above)
+      dataset = Dataset.load(conf, files[0]);
+      converter = new DataConverter(dataset);
+
+      forest = DecisionForest.load(conf, files[1]);
+      if (forest == null) {
+        throw new InterruptedException("DecisionForest not found!");
+      }
+    }
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+      if (first) {
+        // the first record emitted for a split carries the input file's name;
+        // parseOutput() relies on it to name the per-file output
+        FileSplit split = (FileSplit) context.getInputSplit();
+        Path path = split.getPath(); // current split path
+        lvalue.set(path.getName());
+        lkey.set(key.get());
+        context.write(lkey, lvalue);
+
+        first = false;
+      }
+
+      String line = value.toString();
+      if (!line.isEmpty()) {
+        Instance instance = converter.convert(line);
+        double prediction = forest.classify(dataset, rng, instance);
+        lkey.set(dataset.getLabel(instance));
+        lvalue.set(Double.toString(prediction));
+        context.write(lkey, lvalue);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
new file mode 100644
index 0000000..4d0f3f1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredMapper.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Dataset;
+
+import java.io.IOException;
+
+/**
+ * Base class for Mapred mappers. Loads common parameters from the job
+ */
+@Deprecated
+public class MapredMapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> extends Mapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT> {
+
+  private boolean noOutput;
+
+  private TreeBuilder treeBuilder;
+
+  private Dataset dataset;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+
+    // pull the shared job parameters out of the configuration
+    Configuration configuration = context.getConfiguration();
+    configure(!Builder.isOutput(configuration),
+              Builder.getTreeBuilder(configuration),
+              Builder.loadDataset(configuration));
+  }
+
+  /**
+   * Useful for testing
+   */
+  protected void configure(boolean noOutput, TreeBuilder treeBuilder, Dataset dataset) {
+    Preconditions.checkArgument(treeBuilder != null, "TreeBuilder not found in the Job parameters");
+    this.noOutput = noOutput;
+    this.treeBuilder = treeBuilder;
+    this.dataset = dataset;
+  }
+
+  /**
+   *
+   * @return whether the mapper does estimate and output predictions
+   */
+  protected boolean isOutput() {
+    return !noOutput;
+  }
+
+  protected TreeBuilder getTreeBuilder() {
+    return treeBuilder;
+  }
+
+  protected Dataset getDataset() {
+    return dataset;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
new file mode 100644
index 0000000..56cabb2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/MapredOutput.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Used by various implementation to return the results of a build.<br>
+ * Contains a grown tree and its oob predictions.
+ */
+@Deprecated
+public class MapredOutput implements Writable, Cloneable {
+
+  private Node tree;
+
+  // may legitimately be null, see MapredOutput(Node)
+  private int[] predictions;
+
+  public MapredOutput() {
+  }
+
+  public MapredOutput(Node tree, int[] predictions) {
+    this.tree = tree;
+    this.predictions = predictions;
+  }
+
+  public MapredOutput(Node tree) {
+    this(tree, null);
+  }
+
+  public Node getTree() {
+    return tree;
+  }
+
+  int[] getPredictions() {
+    return predictions;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    // mirrors write(): each field is preceded by a presence flag
+    boolean readTree = in.readBoolean();
+    if (readTree) {
+      tree = Node.read(in);
+    }
+
+    boolean readPredictions = in.readBoolean();
+    if (readPredictions) {
+      predictions = DFUtils.readIntArray(in);
+    }
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeBoolean(tree != null);
+    if (tree != null) {
+      tree.write(out);
+    }
+
+    out.writeBoolean(predictions != null);
+    if (predictions != null) {
+      DFUtils.writeArray(out, predictions);
+    }
+  }
+
+  @Override
+  public MapredOutput clone() {
+    // copy the predictions array so the clone does not share mutable state
+    return new MapredOutput(tree, predictions == null ? null : predictions.clone());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof MapredOutput)) {
+      return false;
+    }
+
+    MapredOutput mo = (MapredOutput) obj;
+
+    return ((tree == null && mo.getTree() == null) || (tree != null && tree.equals(mo.getTree())))
+        && Arrays.equals(predictions, mo.getPredictions());
+  }
+
+  @Override
+  public int hashCode() {
+    int hashCode = tree == null ? 1 : tree.hashCode();
+    // guard against null: the previous version threw NPE for instances
+    // created via MapredOutput(Node) or the no-arg constructor
+    if (predictions != null) {
+      for (int prediction : predictions) {
+        hashCode = 31 * hashCode + prediction;
+      }
+    }
+    return hashCode;
+  }
+
+  @Override
+  public String toString() {
+    return "{" + tree + " | " + Arrays.toString(predictions) + '}';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
new file mode 100644
index 0000000..86d4404
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemBuilder.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+
+/**
+ * MapReduce implementation where each mapper loads a full copy of the data in-memory. The forest trees are
+ * split across all the mappers
+ */
+@Deprecated
+public class InMemBuilder extends Builder {
+
+  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed, Configuration conf) {
+    super(treeBuilder, dataPath, datasetPath, seed, conf);
+  }
+
+  public InMemBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath) {
+    this(treeBuilder, dataPath, datasetPath, null, new Configuration());
+  }
+
+  @Override
+  protected void configureJob(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+
+    job.setJarByClass(InMemBuilder.class);
+
+    FileOutputFormat.setOutputPath(job, getOutputPath(conf));
+
+    // put the data in the DistributedCache
+    DistributedCache.addCacheFile(getDataPath().toUri(), conf);
+
+    job.setOutputKeyClass(IntWritable.class);
+    job.setOutputValueClass(MapredOutput.class);
+
+    job.setMapperClass(InMemMapper.class);
+    job.setNumReduceTasks(0); // no reducers
+
+    job.setInputFormatClass(InMemInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+  }
+
+  @Override
+  protected DecisionForest parseOutput(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+
+    Map<Integer,MapredOutput> output = new HashMap<>();
+
+    Path outputPath = getOutputPath(conf);
+    FileSystem fs = outputPath.getFileSystem(conf);
+
+    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+
+    // import the InMemOutputs
+    for (Path path : outfiles) {
+      for (Pair<IntWritable,MapredOutput> record : new SequenceFileIterable<IntWritable,MapredOutput>(path, conf)) {
+        output.put(record.getFirst().get(), record.getSecond());
+      }
+    }
+
+    return processOutput(output);
+  }
+
+  /**
+   * Process the output, extracting the trees.
+   */
+  private static DecisionForest processOutput(Map<Integer,MapredOutput> output) {
+    List<Node> trees = new ArrayList<>(output.size());
+
+    // only the trees are needed, so iterate over the values directly
+    // instead of over the entry set
+    for (MapredOutput value : output.values()) {
+      trees.add(value.getTree());
+    }
+
+    return new DecisionForest(trees);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
new file mode 100644
index 0000000..c3b2fa3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemInputFormat.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Custom InputFormat that generates InputSplits given the desired number of trees.<br>
+ * each input split contains a subset of the trees.<br>
+ * The number of splits is equal to the requested number of map tasks.
+ */
+@Deprecated
+public class InMemInputFormat extends InputFormat<IntWritable,NullWritable> {
+
+  // Fixed: the logger was registered under InMemInputSplit.class, mislabeling this class's log output.
+  private static final Logger log = LoggerFactory.getLogger(InMemInputFormat.class);
+
+  /** Used to derive per-split seeds; null when no seed is set or in single-seed debug mode. */
+  private Random rng;
+
+  /** Base random seed read from the configuration, or null if none was set. */
+  private Long seed;
+
+  private boolean isSingleSeed;
+
+  /**
+   * Used for DEBUG purposes only. if true and a seed is available, all the mappers use the same seed, thus
+   * all the mapper should take the same time to build their trees.
+   */
+  private static boolean isSingleSeed(Configuration conf) {
+    return conf.getBoolean("debug.mahout.rf.single.seed", false);
+  }
+
+  @Override
+  public RecordReader<IntWritable,NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
+    throws IOException, InterruptedException {
+    Preconditions.checkArgument(split instanceof InMemInputSplit);
+    return new InMemRecordReader((InMemInputSplit) split);
+  }
+
+  @Override
+  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
+    Configuration conf = context.getConfiguration();
+    // "mapred.map.tasks" must have been set by the builder; the -1 default is rejected below.
+    int numSplits = conf.getInt("mapred.map.tasks", -1);
+
+    return getSplits(conf, numSplits);
+  }
+
+  /**
+   * Partitions the configured number of trees into {@code numSplits} splits.
+   *
+   * @param conf configuration holding the number of trees and the optional random seed
+   * @param numSplits desired number of splits; must be strictly positive
+   * @return one {@link InMemInputSplit} per mapper, together covering all the trees
+   */
+  public List<InputSplit> getSplits(Configuration conf, int numSplits) {
+    // Fail fast with a clear message: previously numSplits == 0 surfaced as an ArithmeticException
+    // (division below) and numSplits < 0 as a negative-capacity ArrayList failure.
+    Preconditions.checkArgument(numSplits > 0, "numSplits must be > 0, but was %s", numSplits);
+
+    int nbTrees = Builder.getNbTrees(conf);
+    int splitSize = nbTrees / numSplits;
+
+    seed = Builder.getRandomSeed(conf);
+    isSingleSeed = isSingleSeed(conf);
+
+    if (rng != null && seed != null) {
+      log.warn("getSplits() was called more than once and the 'seed' is set, "
+          + "this can lead to non-repeatable behavior");
+    }
+
+    rng = seed == null || isSingleSeed ? null : RandomUtils.getRandom(seed);
+
+    int id = 0;
+
+    List<InputSplit> splits = new ArrayList<>(numSplits);
+
+    for (int index = 0; index < numSplits - 1; index++) {
+      splits.add(new InMemInputSplit(id, splitSize, nextSeed()));
+      id += splitSize;
+    }
+
+    // take care of the remainder: the last split gets every tree not yet assigned
+    splits.add(new InMemInputSplit(id, nbTrees - id, nextSeed()));
+
+    return splits;
+  }
+
+  /**
+   * @return the seed for the next InputSplit
+   */
+  private Long nextSeed() {
+    if (seed == null) {
+      return null;
+    } else if (isSingleSeed) {
+      return seed;
+    } else {
+      return rng.nextLong();
+    }
+  }
+
+  /**
+   * RecordReader that emits one key per tree id of the split; values are always NullWritable.
+   */
+  public static class InMemRecordReader extends RecordReader<IntWritable,NullWritable> {
+
+    private final InMemInputSplit split;
+    private int pos;
+    private IntWritable key;
+    private NullWritable value;
+
+    public InMemRecordReader(InMemInputSplit split) {
+      this.split = split;
+    }
+
+    @Override
+    public float getProgress() throws IOException {
+      // pos is incremented after each record, so (pos - 1) is the number of fully-consumed records
+      return pos == 0 ? 0.0f : (float) (pos - 1) / split.nbTrees;
+    }
+
+    @Override
+    public IntWritable getCurrentKey() throws IOException, InterruptedException {
+      return key;
+    }
+
+    @Override
+    public NullWritable getCurrentValue() throws IOException, InterruptedException {
+      return value;
+    }
+
+    @Override
+    public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
+      // the split was already supplied via the constructor; only the key/value holders are created here
+      key = new IntWritable();
+      value = NullWritable.get();
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      if (pos < split.nbTrees) {
+        key.set(split.firstId + pos);
+        pos++;
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void close() throws IOException {
+      // nothing to release: all state is in memory
+    }
+
+  }
+
+  /**
+   * Custom InputSplit that indicates how many trees are built by each mapper
+   */
+  public static class InMemInputSplit extends InputSplit implements Writable {
+
+    private static final String[] NO_LOCATIONS = new String[0];
+
+    /** Id of the first tree of this split */
+    private int firstId;
+
+    /** Number of trees this split's mapper must build. */
+    private int nbTrees;
+
+    /** Random seed for this split, or null if no seed is available. */
+    private Long seed;
+
+    public InMemInputSplit() { }
+
+    public InMemInputSplit(int firstId, int nbTrees, Long seed) {
+      this.firstId = firstId;
+      this.nbTrees = nbTrees;
+      this.seed = seed;
+    }
+
+    /**
+     * @return the Id of the first tree of this split
+     */
+    public int getFirstId() {
+      return firstId;
+    }
+
+    /**
+     * @return the number of trees
+     */
+    public int getNbTrees() {
+      return nbTrees;
+    }
+
+    /**
+     * @return the random seed or null if no seed is available
+     */
+    public Long getSeed() {
+      return seed;
+    }
+
+    @Override
+    public long getLength() throws IOException {
+      // "length" of an in-memory split is simply the number of trees it covers
+      return nbTrees;
+    }
+
+    @Override
+    public String[] getLocations() throws IOException {
+      // the data is in the DistributedCache, so there is no preferred host
+      return NO_LOCATIONS;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (!(obj instanceof InMemInputSplit)) {
+        return false;
+      }
+
+      InMemInputSplit split = (InMemInputSplit) obj;
+
+      if (firstId != split.firstId || nbTrees != split.nbTrees) {
+        return false;
+      }
+      // seed may be null; null-safe comparison
+      if (seed == null) {
+        return split.seed == null;
+      } else {
+        return seed.equals(split.seed);
+      }
+
+    }
+
+    @Override
+    public int hashCode() {
+      return firstId + nbTrees + (seed == null ? 0 : seed.intValue());
+    }
+
+    @Override
+    public String toString() {
+      return String.format(Locale.ENGLISH, "[firstId:%d, nbTrees:%d, seed:%d]", firstId, nbTrees, seed);
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+      firstId = in.readInt();
+      nbTrees = in.readInt();
+      // a boolean flag marks whether a seed was serialized (see write())
+      boolean isSeed = in.readBoolean();
+      seed = isSeed ? in.readLong() : null;
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+      out.writeInt(firstId);
+      out.writeInt(nbTrees);
+      out.writeBoolean(seed != null);
+      if (seed != null) {
+        out.writeLong(seed);
+      }
+    }
+
+    /** Deserialization helper: reads a split from {@code in}. */
+    public static InMemInputSplit read(DataInput in) throws IOException {
+      InMemInputSplit split = new InMemInputSplit();
+      split.readFields(in);
+      return split;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
new file mode 100644
index 0000000..2fc67ba
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/InMemMapper.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.inmem;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataLoader;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredMapper;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat.InMemInputSplit;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * In-memory mapper that grows the trees using a full copy of the data loaded in-memory. The number of trees
+ * to grow is determined by the current InMemInputSplit.
+ */
+@Deprecated
+public class InMemMapper extends MapredMapper<IntWritable,NullWritable,IntWritable,MapredOutput> {
+
+  private static final Logger log = LoggerFactory.getLogger(InMemMapper.class);
+
+  /** Grows the trees over the full in-memory copy of the training data. */
+  private Bagging bagging;
+
+  /** Random generator, seeded lazily from the current input split. */
+  private Random rng;
+
+  /**
+   * Load the training data from the distributed cache.
+   */
+  private static Data loadData(Configuration conf, Dataset dataset) throws IOException {
+    Path cachedPath = Builder.getDistributedCacheFile(conf, 1);
+    FileSystem fileSystem = FileSystem.get(cachedPath.toUri(), conf);
+    return DataLoader.loadData(dataset, fileSystem, cachedPath);
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+
+    Configuration conf = context.getConfiguration();
+
+    log.info("Loading the data...");
+    Data trainingData = loadData(conf, getDataset());
+    log.info("Data loaded : {} instances", trainingData.size());
+
+    bagging = new Bagging(getTreeBuilder(), trainingData);
+  }
+
+  @Override
+  protected void map(IntWritable key,
+                     NullWritable value,
+                     Context context) throws IOException, InterruptedException {
+    // the value is always NullWritable; only the key (tree id) matters
+    map(key, context);
+  }
+
+  /**
+   * Builds one tree for the given tree id and, when output is enabled, writes it out.
+   */
+  void map(IntWritable key, Context context) throws IOException, InterruptedException {
+
+    initRandom((InMemInputSplit) context.getInputSplit());
+
+    log.debug("Building...");
+    Node tree = bagging.build(rng);
+
+    if (isOutput()) {
+      log.debug("Outputing...");
+      context.write(key, new MapredOutput(tree));
+    }
+  }
+
+  /**
+   * Seeds the random generator from the split on the first call; later calls are no-ops.
+   */
+  void initRandom(InMemInputSplit split) {
+    if (rng != null) {
+      return; // already initialised by an earlier map() call
+    }
+    Long seed = split.getSeed();
+    log.debug("Initialising rng with seed : {}", seed);
+    rng = seed == null ? RandomUtils.getRandom() : RandomUtils.getRandom(seed);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
new file mode 100644
index 0000000..61e65e8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/inmem/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * <h2>In-memory mapreduce implementation of Random Decision Forests</h2>
+ *
+ * <p>Each mapper is responsible for growing a number of trees with a whole copy of the dataset loaded in memory,
+ * it uses the reference implementation's code to build each tree and estimate the oob error.</p>
+ *
+ * <p>The dataset is distributed to the slave nodes using the {@link org.apache.hadoop.filecache.DistributedCache}.
+ * A custom {@link org.apache.hadoop.mapreduce.InputFormat}
+ * ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemInputFormat}) is configured with the
+ * desired number of trees and generates a number of {@link org.apache.hadoop.mapreduce.InputSplit}s
+ * equal to the configured number of maps.</p>
+ *
+ * <p>There is no need for reducers: each map outputs the trees it built and, for each tree, the labels the
+ * tree predicted for each out-of-bag instance. This step has to be done in the mapper because only the mapper
+ * knows which instances are out-of-bag.</p>
+ *
+ * <p>The Forest builder ({@link org.apache.mahout.classifier.df.mapreduce.inmem.InMemBuilder}) is responsible
+ * for configuring and launching the job.
+ * At the end of the job it parses the output files and builds the corresponding
+ * {@link org.apache.mahout.classifier.df.DecisionForest}.</p>
+ */
+package org.apache.mahout.classifier.df.mapreduce.inmem;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
new file mode 100644
index 0000000..9236af3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/PartialBuilder.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Builds a random forest using partial data. Each mapper uses only the data given by its InputSplit
+ */
+@Deprecated
+public class PartialBuilder extends Builder {
+
+  private static final Logger log = LoggerFactory.getLogger(PartialBuilder.class);
+
+  public PartialBuilder(TreeBuilder treeBuilder, Path dataPath, Path datasetPath, Long seed) {
+    this(treeBuilder, dataPath, datasetPath, seed, new Configuration());
+  }
+
+  /**
+   * @param treeBuilder
+   *          builder used by each mapper to grow its trees
+   * @param dataPath
+   *          path to the training data
+   * @param datasetPath
+   *          path to the dataset descriptor
+   * @param seed
+   *          random seed, or null if none is available
+   * @param conf
+   *          job configuration
+   */
+  public PartialBuilder(TreeBuilder treeBuilder,
+                        Path dataPath,
+                        Path datasetPath,
+                        Long seed,
+                        Configuration conf) {
+    super(treeBuilder, dataPath, datasetPath, seed, conf);
+  }
+
+  /** Wires up the partial-data job: Step1Mapper, no reducers, sequence-file output. */
+  @Override
+  protected void configureJob(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+
+    job.setJarByClass(PartialBuilder.class);
+
+    FileInputFormat.setInputPaths(job, getDataPath());
+    FileOutputFormat.setOutputPath(job, getOutputPath(conf));
+
+    job.setOutputKeyClass(TreeID.class);
+    job.setOutputValueClass(MapredOutput.class);
+
+    job.setMapperClass(Step1Mapper.class);
+    job.setNumReduceTasks(0); // no reducers
+
+    job.setInputFormatClass(TextInputFormat.class);
+    job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+    // For this implementation to work, mapred.map.tasks needs to be set to the actual
+    // number of mappers Hadoop will use:
+    TextInputFormat inputFormat = new TextInputFormat();
+    List<?> splits = inputFormat.getSplits(job);
+    if (splits == null || splits.isEmpty()) {
+      log.warn("Unable to compute number of splits?");
+    } else {
+      int numSplits = splits.size();
+      log.info("Setting mapred.map.tasks = {}", numSplits);
+      conf.setInt("mapred.map.tasks", numSplits);
+    }
+  }
+
+  /** Reads the job's output and assembles the forest from the grown trees. */
+  @Override
+  protected DecisionForest parseOutput(Job job) throws IOException {
+    Configuration conf = job.getConfiguration();
+
+    int numTrees = Builder.getNbTrees(conf);
+
+    Path outputPath = getOutputPath(conf);
+
+    TreeID[] keys = new TreeID[numTrees];
+    Node[] trees = new Node[numTrees];
+
+    processOutput(job, outputPath, keys, trees);
+
+    return new DecisionForest(Arrays.asList(trees));
+  }
+
+  /**
+   * Processes the output from the output path.<br>
+   *
+   * @param job
+   *          job context, used to read the configuration
+   * @param outputPath
+   *          directory that contains the output of the job
+   * @param keys
+   *          can be null
+   * @param trees
+   *          can be null
+   * @throws java.io.IOException
+   */
+  protected static void processOutput(JobContext job,
+                                      Path outputPath,
+                                      TreeID[] keys,
+                                      Node[] trees) throws IOException {
+    // keys and trees must either both be null or both be non-null with equal length
+    Preconditions.checkArgument(keys == null && trees == null || keys != null && trees != null,
+        "if keys is null, trees should also be null");
+    Preconditions.checkArgument(keys == null || keys.length == trees.length, "keys.length != trees.length");
+
+    Configuration conf = job.getConfiguration();
+
+    FileSystem fs = outputPath.getFileSystem(conf);
+
+    Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+
+    // read all the outputs
+    int index = 0;
+    for (Path path : outfiles) {
+      for (Pair<TreeID,MapredOutput> record : new SequenceFileIterable<TreeID, MapredOutput>(path, conf)) {
+        TreeID key = record.getFirst();
+        MapredOutput value = record.getSecond();
+        if (keys != null) {
+          keys[index] = key;
+        }
+        if (trees != null) {
+          trees[index] = value.getTree();
+        }
+        index++;
+      }
+    }
+
+    // make sure we got all the keys/values
+    if (keys != null && index != keys.length) {
+      throw new IllegalStateException("Some key/values are missing from the output");
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
new file mode 100644
index 0000000..9474236
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/Step1Mapper.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.classifier.df.mapreduce.MapredMapper;
+import org.apache.mahout.classifier.df.mapreduce.MapredOutput;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * First step of the Partial Data Builder. Builds the trees using the data available in the InputSplit.
+ * Predict the oob classes for each tree in its growing partition (input split).
+ */
+@Deprecated
+public class Step1Mapper extends MapredMapper<LongWritable,Text,TreeID,MapredOutput> {
+
+  private static final Logger log = LoggerFactory.getLogger(Step1Mapper.class);
+
+  /** used to convert input values to data instances */
+  private DataConverter converter;
+
+  /** random generator used by Bagging; seeded once in configure() */
+  private Random rng;
+
+  /** number of trees to be built by this mapper */
+  private int nbTrees;
+
+  /** id of the first tree */
+  private int firstTreeId;
+
+  /** mapper's partition */
+  private int partition;
+
+  /** will contain all instances of this mapper's split */
+  private final List<Instance> instances = new ArrayList<>();
+
+  public int getFirstTreeId() {
+    return firstTreeId;
+  }
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+
+    configure(Builder.getRandomSeed(conf), conf.getInt("mapred.task.partition", -1),
+      Builder.getNumMaps(conf), Builder.getNbTrees(conf));
+  }
+
+  /**
+   * Useful when testing
+   *
+   * @param seed
+   *          random seed, or null to use a self-generated seed
+   * @param partition
+   *          current mapper inputSplit partition
+   * @param numMapTasks
+   *          number of running map tasks
+   * @param numTrees
+   *          total number of trees in the forest
+   */
+  protected void configure(Long seed, int partition, int numMapTasks, int numTrees) {
+    converter = new DataConverter(getDataset());
+
+    // prepare random-numbers generator
+    log.debug("seed : {}", seed);
+    if (seed == null) {
+      rng = RandomUtils.getRandom();
+    } else {
+      rng = RandomUtils.getRandom(seed);
+    }
+
+    // mapper's partition
+    Preconditions.checkArgument(partition >= 0, "Wrong partition ID: " + partition + ". Partition must be >= 0!");
+    this.partition = partition;
+
+    // compute number of trees to build
+    nbTrees = nbTrees(numMapTasks, numTrees, partition);
+
+    // compute first tree id: sum of the tree counts of all preceding partitions
+    firstTreeId = 0;
+    for (int p = 0; p < partition; p++) {
+      firstTreeId += nbTrees(numMapTasks, numTrees, p);
+    }
+
+    log.debug("partition : {}", partition);
+    log.debug("nbTrees : {}", nbTrees);
+    log.debug("firstTreeId : {}", firstTreeId);
+  }
+
+  /**
+   * Compute the number of trees for a given partition. The first partitions may be longer
+   * than the rest because of the remainder.
+   *
+   * @param numMaps
+   *          total number of maps (partitions)
+   * @param numTrees
+   *          total number of trees to build
+   * @param partition
+   *          partition to compute the number of trees for
+   * @return number of trees the given partition must build
+   */
+  public static int nbTrees(int numMaps, int numTrees, int partition) {
+    int treesPerMapper = numTrees / numMaps;
+    int remainder = numTrees - numMaps * treesPerMapper;
+    // the first 'remainder' partitions each build one extra tree
+    return treesPerMapper + (partition < remainder ? 1 : 0);
+  }
+
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    // just buffer the instances; the trees are built in cleanup() once the whole split has been read
+    instances.add(converter.convert(value.toString()));
+  }
+
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    // prepare the data
+    log.debug("partition: {} numInstances: {}", partition, instances.size());
+
+    Data data = new Data(getDataset(), instances);
+    Bagging bagging = new Bagging(getTreeBuilder(), data);
+
+    TreeID key = new TreeID();
+
+    log.debug("Building {} trees", nbTrees);
+    for (int treeId = 0; treeId < nbTrees; treeId++) {
+      log.debug("Building tree number : {}", treeId);
+
+      Node tree = bagging.build(rng);
+
+      key.set(partition, firstTreeId + treeId);
+
+      if (isOutput()) {
+        MapredOutput emOut = new MapredOutput(tree);
+        context.write(key, emOut);
+      }
+
+      // report progress so long-running tree builds don't get the task killed
+      context.progress();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
new file mode 100644
index 0000000..c296061
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/TreeID.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.mapreduce.partial;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.io.LongWritable;
+
+/**
+ * Indicates both the tree and the data partition used to grow the tree
+ */
+@Deprecated
+public class TreeID extends LongWritable implements Cloneable {
+
+  /** Number of distinct tree ids per partition; the encoded value is partition * MAX_TREEID + treeId. */
+  public static final int MAX_TREEID = 100000;
+
+  public TreeID() { }
+
+  /**
+   * @param partition data partition (InputSplit's index) the tree was grown on; must be &gt;= 0
+   * @param treeId tree index inside the partition; must be in [0, MAX_TREEID)
+   */
+  public TreeID(int partition, int treeId) {
+    // validation now lives in set(), so it also covers direct set() calls
+    set(partition, treeId);
+  }
+
+  /**
+   * Encodes the (partition, treeId) pair into the underlying long value.
+   * Fixed: treeId was previously unvalidated here, so a treeId &gt;= MAX_TREEID
+   * silently corrupted the partition component of the encoding.
+   */
+  public void set(int partition, int treeId) {
+    Preconditions.checkArgument(partition >= 0, "Wrong partition: " + partition + ". Partition must be >= 0!");
+    Preconditions.checkArgument(treeId >= 0, "Wrong treeId: " + treeId + ". TreeId must be >= 0!");
+    Preconditions.checkArgument(treeId < MAX_TREEID,
+        "Wrong treeId: " + treeId + ". TreeId must be < MAX_TREEID (" + MAX_TREEID + ")!");
+    set((long) partition * MAX_TREEID + treeId);
+  }
+
+  /**
+   * Data partition (InputSplit's index) that was used to grow the tree
+   */
+  public int partition() {
+    return (int) (get() / MAX_TREEID);
+  }
+
+  /** @return tree index inside its partition */
+  public int treeId() {
+    return (int) (get() % MAX_TREEID);
+  }
+
+  @Override
+  public TreeID clone() {
+    // copy via re-encoding; note this does not call super.clone(), it builds a fresh instance
+    return new TreeID(partition(), treeId());
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
new file mode 100644
index 0000000..e621c91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/mapreduce/partial/package-info.java
@@ -0,0 +1,16 @@
+/**
+ * <h2>Partial-data mapreduce implementation of Random Decision Forests</h2>
+ *
+ * <p>The builder splits the data, using a FileInputSplit, among the mappers.
+ * Building the forest and estimating the oob error takes two job steps.</p>
+ *
+ * <p>In the first step, each mapper is responsible for growing a number of trees with its partition's data,
+ * loading the data instances in its {@code map()} function, then building the trees in the {@code close()} method. It
+ * uses the reference implementation's code to build each tree and estimate the oob error.</p>
+ *
+ * <p>The second step is needed when estimating the oob error. Each mapper loads all the trees that does not
+ * belong to its own partition (were not built using the partition's data) and uses them to classify the
+ * partition's data instances. The data instances are loaded in the {@code map()} method and the classification
+ * is performed in the {@code close()} method.</p>
+ */
+package org.apache.mahout.classifier.df.mapreduce.partial;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
new file mode 100644
index 0000000..1f91842
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/CategoricalNode.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+@Deprecated
+public class CategoricalNode extends Node {
+
+ private int attr;
+ private double[] values;
+ private Node[] childs;
+
+ public CategoricalNode() {
+ }
+
+ public CategoricalNode(int attr, double[] values, Node[] childs) {
+ this.attr = attr;
+ this.values = values;
+ this.childs = childs;
+ }
+
+ @Override
+ public double classify(Instance instance) {
+ int index = ArrayUtils.indexOf(values, instance.get(attr));
+ if (index == -1) {
+ // value not available, we cannot predict
+ return Double.NaN;
+ }
+ return childs[index].classify(instance);
+ }
+
+ @Override
+ public long maxDepth() {
+ long max = 0;
+
+ for (Node child : childs) {
+ long depth = child.maxDepth();
+ if (depth > max) {
+ max = depth;
+ }
+ }
+
+ return 1 + max;
+ }
+
+ @Override
+ public long nbNodes() {
+ long nbNodes = 1;
+
+ for (Node child : childs) {
+ nbNodes += child.nbNodes();
+ }
+
+ return nbNodes;
+ }
+
+ @Override
+ protected Type getType() {
+ return Type.CATEGORICAL;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof CategoricalNode)) {
+ return false;
+ }
+
+ CategoricalNode node = (CategoricalNode) obj;
+
+ return attr == node.attr && Arrays.equals(values, node.values) && Arrays.equals(childs, node.childs);
+ }
+
+ @Override
+ public int hashCode() {
+ int hashCode = attr;
+ for (double value : values) {
+ hashCode = 31 * hashCode + (int) Double.doubleToLongBits(value);
+ }
+ for (Node node : childs) {
+ hashCode = 31 * hashCode + node.hashCode();
+ }
+ return hashCode;
+ }
+
+ @Override
+ protected String getString() {
+ StringBuilder buffer = new StringBuilder();
+
+ for (Node child : childs) {
+ buffer.append(child).append(',');
+ }
+
+ return buffer.toString();
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ attr = in.readInt();
+ values = DFUtils.readDoubleArray(in);
+ childs = DFUtils.readNodeArray(in);
+ }
+
+ @Override
+ protected void writeNode(DataOutput out) throws IOException {
+ out.writeInt(attr);
+ DFUtils.writeArray(out, values);
+ DFUtils.writeArray(out, childs);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
new file mode 100644
index 0000000..3360bb5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Leaf.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
/**
 * Terminal node of a decision tree: classification always returns the stored label.
 */
@Deprecated
public class Leaf extends Node {
  // tolerance used by equals(); labels closer than this compare as the same leaf
  private static final double EPSILON = 1.0e-6;

  // predicted label; set by the constructor or by readFields()
  private double label;

  // package-private no-arg constructor, required by Node.read() for deserialization
  Leaf() { }

  public Leaf(double label) {
    this.label = label;
  }

  /** Returns the stored label regardless of the instance's attribute values. */
  @Override
  public double classify(Instance instance) {
    return label;
  }

  /** A leaf counts as depth 1. */
  @Override
  public long maxDepth() {
    return 1;
  }

  /** A leaf is a single node. */
  @Override
  public long nbNodes() {
    return 1;
  }

  @Override
  protected Type getType() {
    return Type.LEAF;
  }

  // NOTE(review): equals() uses an EPSILON tolerance while hashCode() hashes the
  // exact bit pattern, so two leaves that compare equal may have different hash
  // codes (equals/hashCode contract violation). Avoid using Leaf as a key in
  // hash-based collections until this is reconciled.
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof Leaf)) {
      return false;
    }

    Leaf leaf = (Leaf) obj;

    return Math.abs(label - leaf.label) < EPSILON;
  }

  @Override
  public int hashCode() {
    long bits = Double.doubleToLongBits(label);
    return (int)(bits ^ (bits >>> 32));
  }

  @Override
  protected String getString() {
    return "";
  }

  // wire format: a single double (the label); must mirror writeNode()
  @Override
  public void readFields(DataInput in) throws IOException {
    label = in.readDouble();
  }

  @Override
  protected void writeNode(DataOutput out) throws IOException {
    out.writeDouble(label);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
new file mode 100644
index 0000000..73d516d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/Node.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
/**
 * Represents an abstract node of a decision tree.
 *
 * <p>Nodes are Hadoop {@link Writable}s. {@link #write(DataOutput)} emits the
 * node's {@link Type} ordinal followed by the subclass-specific payload, and
 * {@link #read(DataInput)} reads them back in the same order. The ordinal is
 * therefore part of the persisted format: do not reorder the {@link Type}
 * constants once forests have been serialized.</p>
 */
@Deprecated
public abstract class Node implements Writable {

  // discriminator written ahead of every node payload; ordinal order is the wire format
  protected enum Type {
    LEAF,
    NUMERICAL,
    CATEGORICAL
  }

  /**
   * predicts the label for the instance
   *
   * @return the predicted label; implementations return {@code Double.NaN}
   *         when the label cannot be predicted (see CategoricalNode)
   */
  public abstract double classify(Instance instance);

  /**
   * @return the total number of nodes of the tree
   */
  public abstract long nbNodes();

  /**
   * @return the maximum depth of the tree
   */
  public abstract long maxDepth();

  // identifies the concrete subclass, used as the serialization discriminator
  protected abstract Type getType();

  /**
   * Deserializes a node (and, recursively, its whole subtree) from the stream.
   *
   * @throws IOException if the stream cannot be read
   */
  public static Node read(DataInput in) throws IOException {
    Type type = Type.values()[in.readInt()];
    Node node;

    switch (type) {
      case LEAF:
        node = new Leaf();
        break;
      case NUMERICAL:
        node = new NumericalNode();
        break;
      case CATEGORICAL:
        node = new CategoricalNode();
        break;
      default:
        // unreachable while Type has exactly the three constants above
        throw new IllegalStateException("This implementation is not currently supported");
    }

    node.readFields(in);

    return node;
  }

  /** @return "TYPE:payload;" — human-readable form used when printing trees */
  @Override
  public final String toString() {
    return getType() + ":" + getString() + ';';
  }

  // subclass payload rendered for toString()
  protected abstract String getString();

  @Override
  public final void write(DataOutput out) throws IOException {
    out.writeInt(getType().ordinal());
    writeNode(out);
  }

  // writes the subclass payload; must mirror the subclass's readFields()
  protected abstract void writeNode(DataOutput out) throws IOException;

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
new file mode 100644
index 0000000..aa02089
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/node/NumericalNode.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.node;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Represents a node that splits using a numerical attribute
+ */
+@Deprecated
+public class NumericalNode extends Node {
+ /** numerical attribute to split for */
+ private int attr;
+
+ /** split value */
+ private double split;
+
+ /** child node when attribute's value < split value */
+ private Node loChild;
+
+ /** child node when attribute's value >= split value */
+ private Node hiChild;
+
+ public NumericalNode() { }
+
+ public NumericalNode(int attr, double split, Node loChild, Node hiChild) {
+ this.attr = attr;
+ this.split = split;
+ this.loChild = loChild;
+ this.hiChild = hiChild;
+ }
+
+ @Override
+ public double classify(Instance instance) {
+ if (instance.get(attr) < split) {
+ return loChild.classify(instance);
+ } else {
+ return hiChild.classify(instance);
+ }
+ }
+
+ @Override
+ public long maxDepth() {
+ return 1 + Math.max(loChild.maxDepth(), hiChild.maxDepth());
+ }
+
+ @Override
+ public long nbNodes() {
+ return 1 + loChild.nbNodes() + hiChild.nbNodes();
+ }
+
+ @Override
+ protected Type getType() {
+ return Type.NUMERICAL;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof NumericalNode)) {
+ return false;
+ }
+
+ NumericalNode node = (NumericalNode) obj;
+
+ return attr == node.attr && split == node.split && loChild.equals(node.loChild) && hiChild.equals(node.hiChild);
+ }
+
+ @Override
+ public int hashCode() {
+ return attr + (int) Double.doubleToLongBits(split) + loChild.hashCode() + hiChild.hashCode();
+ }
+
+ @Override
+ protected String getString() {
+ return loChild.toString() + ',' + hiChild.toString();
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ attr = in.readInt();
+ split = in.readDouble();
+ loChild = Node.read(in);
+ hiChild = Node.read(in);
+ }
+
+ @Override
+ protected void writeNode(DataOutput out) throws IOException {
+ out.writeInt(attr);
+ out.writeDouble(split);
+ loChild.write(out);
+ hiChild.write(out);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
new file mode 100644
index 0000000..7ef907e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ref/SequentialBuilder.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.ref;
+
+import org.apache.mahout.classifier.df.Bagging;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Builds a Random Decision Forest using a given TreeBuilder to grow the trees
+ */
+@Deprecated
+public class SequentialBuilder {
+
+ private static final Logger log = LoggerFactory.getLogger(SequentialBuilder.class);
+
+ private final Random rng;
+
+ private final Bagging bagging;
+
+ /**
+ * Constructor
+ *
+ * @param rng
+ * random-numbers generator
+ * @param treeBuilder
+ * tree builder
+ * @param data
+ * training data
+ */
+ public SequentialBuilder(Random rng, TreeBuilder treeBuilder, Data data) {
+ this.rng = rng;
+ bagging = new Bagging(treeBuilder, data);
+ }
+
+ public DecisionForest build(int nbTrees) {
+ List<Node> trees = new ArrayList<>();
+
+ for (int treeId = 0; treeId < nbTrees; treeId++) {
+ trees.add(bagging.build(rng));
+ logProgress(((float) treeId + 1) / nbTrees);
+ }
+
+ return new DecisionForest(trees);
+ }
+
+ private static void logProgress(float progress) {
+ int percent = (int) (progress * 100);
+ if (percent % 10 == 0) {
+ log.info("Building {}%", percent);
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
new file mode 100644
index 0000000..3f1cfdf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/DefaultIgSplit.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+
+import java.util.Arrays;
+
+/**
+ * Default, not optimized, implementation of IgSplit
+ */
+@Deprecated
+public class DefaultIgSplit extends IgSplit {
+
+ /** used by entropy() */
+ private int[] counts;
+
+ @Override
+ public Split computeSplit(Data data, int attr) {
+ if (data.getDataset().isNumerical(attr)) {
+ double[] values = data.values(attr);
+ double bestIg = -1;
+ double bestSplit = 0.0;
+
+ for (double value : values) {
+ double ig = numericalIg(data, attr, value);
+ if (ig > bestIg) {
+ bestIg = ig;
+ bestSplit = value;
+ }
+ }
+
+ return new Split(attr, bestIg, bestSplit);
+ } else {
+ double ig = categoricalIg(data, attr);
+
+ return new Split(attr, ig);
+ }
+ }
+
+ /**
+ * Computes the Information Gain for a CATEGORICAL attribute
+ */
+ double categoricalIg(Data data, int attr) {
+ double[] values = data.values(attr);
+ double hy = entropy(data); // H(Y)
+ double hyx = 0.0; // H(Y|X)
+ double invDataSize = 1.0 / data.size();
+
+ for (double value : values) {
+ Data subset = data.subset(Condition.equals(attr, value));
+ hyx += subset.size() * invDataSize * entropy(subset);
+ }
+
+ return hy - hyx;
+ }
+
+ /**
+ * Computes the Information Gain for a NUMERICAL attribute given a splitting value
+ */
+ double numericalIg(Data data, int attr, double split) {
+ double hy = entropy(data);
+ double invDataSize = 1.0 / data.size();
+
+ // LO subset
+ Data subset = data.subset(Condition.lesser(attr, split));
+ hy -= subset.size() * invDataSize * entropy(subset);
+
+ // HI subset
+ subset = data.subset(Condition.greaterOrEquals(attr, split));
+ hy -= subset.size() * invDataSize * entropy(subset);
+
+ return hy;
+ }
+
+ /**
+ * Computes the Entropy
+ */
+ protected double entropy(Data data) {
+ double invDataSize = 1.0 / data.size();
+
+ if (counts == null) {
+ counts = new int[data.getDataset().nblabels()];
+ }
+
+ Arrays.fill(counts, 0);
+ data.countLabels(counts);
+
+ double entropy = 0.0;
+ for (int label = 0; label < data.getDataset().nblabels(); label++) {
+ int count = counts[label];
+ if (count == 0) {
+ continue; // otherwise we get a NaN
+ }
+ double p = count * invDataSize;
+ entropy += -p * Math.log(p) / LOG2;
+ }
+
+ return entropy;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
new file mode 100644
index 0000000..aff94e1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/IgSplit.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.classifier.df.data.Data;
+
/**
 * Base class for computing the best split of an attribute using the
 * Information Gain measure.
 */
@Deprecated
public abstract class IgSplit {

  /** ln(2), used by subclasses to convert natural-log entropies into bits */
  static final double LOG2 = Math.log(2.0);

  /**
   * Computes the best split for the given attribute
   *
   * @param data training data to evaluate candidate splits on
   * @param attr index of the attribute to split on
   * @return the best split found (attribute, information gain and, for
   *         numerical attributes, the split value)
   */
  public abstract Split computeSplit(Data data, int attr);

}
r***@apache.org
2018-06-28 14:54:40 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
new file mode 100644
index 0000000..9f85aab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/NaiveBayesModel.java
@@ -0,0 +1,170 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Preconditions;
+
+/** NaiveBayesModel holds the weight matrix, the feature and label sums and the weight normalizer vectors.*/
+public class NaiveBayesModel {
+
+ private final Vector weightsPerLabel;
+ private final Vector perlabelThetaNormalizer;
+ private final Vector weightsPerFeature;
+ private final Matrix weightsPerLabelAndFeature;
+ private final float alphaI;
+ private final double numFeatures;
+ private final double totalWeightSum;
+ private final boolean isComplementary;
+
+ public final static String COMPLEMENTARY_MODEL = "COMPLEMENTARY_MODEL";
+
+ public NaiveBayesModel(Matrix weightMatrix, Vector weightsPerFeature, Vector weightsPerLabel, Vector thetaNormalizer,
+ float alphaI, boolean isComplementary) {
+ this.weightsPerLabelAndFeature = weightMatrix;
+ this.weightsPerFeature = weightsPerFeature;
+ this.weightsPerLabel = weightsPerLabel;
+ this.perlabelThetaNormalizer = thetaNormalizer;
+ this.numFeatures = weightsPerFeature.getNumNondefaultElements();
+ this.totalWeightSum = weightsPerLabel.zSum();
+ this.alphaI = alphaI;
+ this.isComplementary=isComplementary;
+ }
+
+ public double labelWeight(int label) {
+ return weightsPerLabel.getQuick(label);
+ }
+
+ public double thetaNormalizer(int label) {
+ return perlabelThetaNormalizer.get(label);
+ }
+
+ public double featureWeight(int feature) {
+ return weightsPerFeature.getQuick(feature);
+ }
+
+ public double weight(int label, int feature) {
+ return weightsPerLabelAndFeature.getQuick(label, feature);
+ }
+
+ public float alphaI() {
+ return alphaI;
+ }
+
+ public double numFeatures() {
+ return numFeatures;
+ }
+
+ public double totalWeightSum() {
+ return totalWeightSum;
+ }
+
+ public int numLabels() {
+ return weightsPerLabel.size();
+ }
+
+ public Vector createScoringVector() {
+ return weightsPerLabel.like();
+ }
+
+ public boolean isComplemtary(){
+ return isComplementary;
+ }
+
+ public static NaiveBayesModel materialize(Path output, Configuration conf) throws IOException {
+ FileSystem fs = output.getFileSystem(conf);
+
+ Vector weightsPerLabel;
+ Vector perLabelThetaNormalizer = null;
+ Vector weightsPerFeature;
+ Matrix weightsPerLabelAndFeature;
+ float alphaI;
+ boolean isComplementary;
+
+ try (FSDataInputStream in = fs.open(new Path(output, "naiveBayesModel.bin"))) {
+ alphaI = in.readFloat();
+ isComplementary = in.readBoolean();
+ weightsPerFeature = VectorWritable.readVector(in);
+ weightsPerLabel = new DenseVector(VectorWritable.readVector(in));
+ if (isComplementary){
+ perLabelThetaNormalizer = new DenseVector(VectorWritable.readVector(in));
+ }
+ weightsPerLabelAndFeature = new SparseRowMatrix(weightsPerLabel.size(), weightsPerFeature.size());
+ for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) {
+ weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in));
+ }
+ }
+
+ NaiveBayesModel model = new NaiveBayesModel(weightsPerLabelAndFeature, weightsPerFeature, weightsPerLabel,
+ perLabelThetaNormalizer, alphaI, isComplementary);
+ model.validate();
+ return model;
+ }
+
+ public void serialize(Path output, Configuration conf) throws IOException {
+ FileSystem fs = output.getFileSystem(conf);
+ try (FSDataOutputStream out = fs.create(new Path(output, "naiveBayesModel.bin"))) {
+ out.writeFloat(alphaI);
+ out.writeBoolean(isComplementary);
+ VectorWritable.writeVector(out, weightsPerFeature);
+ VectorWritable.writeVector(out, weightsPerLabel);
+ if (isComplementary){
+ VectorWritable.writeVector(out, perlabelThetaNormalizer);
+ }
+ for (int row = 0; row < weightsPerLabelAndFeature.numRows(); row++) {
+ VectorWritable.writeVector(out, weightsPerLabelAndFeature.viewRow(row));
+ }
+ }
+ }
+
+ public void validate() {
+ Preconditions.checkState(alphaI > 0, "alphaI has to be greater than 0!");
+ Preconditions.checkArgument(numFeatures > 0, "the vocab count has to be greater than 0!");
+ Preconditions.checkArgument(totalWeightSum > 0, "the totalWeightSum has to be greater than 0!");
+ Preconditions.checkNotNull(weightsPerLabel, "the number of labels has to be defined!");
+ Preconditions.checkArgument(weightsPerLabel.getNumNondefaultElements() > 0,
+ "the number of labels has to be greater than 0!");
+ Preconditions.checkNotNull(weightsPerFeature, "the feature sums have to be defined");
+ Preconditions.checkArgument(weightsPerFeature.getNumNondefaultElements() > 0,
+ "the feature sums have to be greater than 0!");
+ if (isComplementary){
+ Preconditions.checkArgument(perlabelThetaNormalizer != null, "the theta normalizers have to be defined");
+ Preconditions.checkArgument(perlabelThetaNormalizer.getNumNondefaultElements() > 0,
+ "the number of theta normalizers has to be greater than 0!");
+ Preconditions.checkArgument(Math.signum(perlabelThetaNormalizer.minValue())
+ == Math.signum(perlabelThetaNormalizer.maxValue()),
+ "Theta normalizers do not all have the same sign");
+ Preconditions.checkArgument(perlabelThetaNormalizer.getNumNonZeroElements()
+ == perlabelThetaNormalizer.size(),
+ "Theta normalizers can not have zero value.");
+ }
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
new file mode 100644
index 0000000..e4ce8aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/StandardNaiveBayesClassifier.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+
+/** Implementation of the Naive Bayes Classifier Algorithm */
+public class StandardNaiveBayesClassifier extends AbstractNaiveBayesClassifier {
+
+ public StandardNaiveBayesClassifier(NaiveBayesModel model) {
+ super(model);
+ }
+
+ @Override
+ public double getScoreForLabelFeature(int label, int feature) {
+ NaiveBayesModel model = getModel();
+ // Standard Naive Bayes does not use weight normalization
+ return computeWeight(model.weight(label, feature), model.labelWeight(label), model.alphaI(), model.numFeatures());
+ }
+
+ public static double computeWeight(double featureLabelWeight, double labelWeight, double alphaI, double numFeatures) {
+ double numerator = featureLabelWeight + alphaI;
+ double denominator = labelWeight + alphaI * numFeatures;
+ return Math.log(numerator / denominator);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
new file mode 100644
index 0000000..37a3b71
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/BayesTestMapper.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.test;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * Runs each input vector through the cached model. The output key is the
+ * expected label (parsed from the second path segment of the input key) and
+ * the output value is the full vector of classification scores.
+ */
+public class BayesTestMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> {
+
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  private AbstractNaiveBayesClassifier classifier;
+
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    super.setup(context);
+    Configuration conf = context.getConfiguration();
+    NaiveBayesModel model = NaiveBayesModel.materialize(HadoopUtil.getSingleCachedFile(conf), conf);
+    boolean complementary = Boolean.parseBoolean(conf.get(TestNaiveBayesDriver.COMPLEMENTARY));
+    if (complementary) {
+      // Ensure the model was actually trained in complementary mode: a
+      // complementary model also works for standard classification, but a
+      // standard model cannot be used for complementary classification.
+      Preconditions.checkArgument(model.isComplemtary(),
+          "Complementary mode in model is different than test mode");
+      classifier = new ComplementaryNaiveBayesClassifier(model);
+    } else {
+      classifier = new StandardNaiveBayesClassifier(model);
+    }
+  }
+
+  @Override
+  protected void map(Text key, VectorWritable value, Context context) throws IOException, InterruptedException {
+    // The key holds the expected label as "/<label>/..."; emit it with the scores.
+    Vector scores = classifier.classifyFull(value.get());
+    context.write(new Text(SLASH.split(key.toString())[1]), new VectorWritable(scores));
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
new file mode 100644
index 0000000..d9eedcf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
@@ -0,0 +1,176 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.test;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.classifier.naivebayes.AbstractNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tests a (Complementary) Naive Bayes model built during training: classifies
+ * every vector of the test set (sequentially or via a map-only job), writes the
+ * score vectors keyed by the expected label, and logs a confusion matrix.
+ */
+public class TestNaiveBayesDriver extends AbstractJob {
+
+  private static final Logger log = LoggerFactory.getLogger(TestNaiveBayesDriver.class);
+
+  public static final String COMPLEMENTARY = "class"; //b for bayes, c for complementary
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TestNaiveBayesDriver(), args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    // BUG FIX: this option was previously registered twice via a nested
+    // addOption(addOption(...)) call; register it exactly once, matching
+    // TrainNaiveBayesJob.
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption("model", "m", "The path to the model built during training", true);
+    addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
+    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
+    addOption("labelIndex", "l", "The path to the location of the label index", true);
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), getOutputPath());
+    }
+
+    boolean sequential = hasOption("runSequential");
+    if (sequential) {
+      runSequential();
+    } else if (!runMapReduce()) {
+      return -1;
+    }
+
+    // Load the label index so score-vector positions can be mapped back to label names.
+    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));
+
+    // Loop over the results and create the confusion matrix.
+    SequenceFileDirIterable<Text, VectorWritable> dirIterable =
+        new SequenceFileDirIterable<>(getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
+    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
+    analyzeResults(labelMap, dirIterable, analyzer);
+
+    log.info("{} Results: {}", hasOption("testComplementary") ? "Complementary" : "Standard NB", analyzer);
+    return 0;
+  }
+
+  /** Classifies the whole test set in-process, writing one part file of score vectors. */
+  private void runSequential() throws IOException {
+    boolean complementary = hasOption("testComplementary");
+    FileSystem fs = FileSystem.get(getConf());
+    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
+
+    // Ensure that if we are testing in complementary mode, the model has been
+    // trained complementary: a complementary model works for standard
+    // classification, but a standard model will not work for complementary mode.
+    if (complementary) {
+      Preconditions.checkArgument(model.isComplemtary(),
+          "Complementary mode in model is different from test mode");
+    }
+
+    AbstractNaiveBayesClassifier classifier;
+    if (complementary) {
+      classifier = new ComplementaryNaiveBayesClassifier(model);
+    } else {
+      classifier = new StandardNaiveBayesClassifier(model);
+    }
+
+    try (SequenceFile.Writer writer =
+             SequenceFile.createWriter(fs, getConf(), new Path(getOutputPath(), "part-r-00000"),
+                 Text.class, VectorWritable.class)) {
+      SequenceFileDirIterable<Text, VectorWritable> dirIterable =
+          new SequenceFileDirIterable<>(getInputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
+      // Loop through the part-r-* files in getInputPath() and write classification
+      // scores for every entry, keyed by the expected label parsed from "/<label>/...".
+      for (Pair<Text, VectorWritable> pair : dirIterable) {
+        writer.append(new Text(SLASH.split(pair.getFirst().toString())[1]),
+            new VectorWritable(classifier.classifyFull(pair.getSecond().get())));
+      }
+    }
+  }
+
+  /**
+   * Classifies the test set with a map-only job ({@link BayesTestMapper}).
+   *
+   * @return true when the job completed successfully
+   */
+  private boolean runMapReduce() throws IOException,
+      InterruptedException, ClassNotFoundException {
+    Path model = new Path(getOption("model"));
+    HadoopUtil.cacheFiles(model, getConf());
+    // The output key is the expected value, the output value are the scores for all the labels.
+    Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class,
+        Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+    boolean complementary = hasOption("testComplementary");
+    testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
+    return testJob.waitForCompletion(true);
+  }
+
+  /** Feeds the arg-max label of every score vector into the {@link ResultAnalyzer}. */
+  private static void analyzeResults(Map<Integer, String> labelMap,
+                                     SequenceFileDirIterable<Text, VectorWritable> dirIterable,
+                                     ResultAnalyzer analyzer) {
+    for (Pair<Text, VectorWritable> pair : dirIterable) {
+      int bestIdx = Integer.MIN_VALUE;
+      // Seed with -infinity so even extremely negative log-scores are accepted
+      // (the old Long.MIN_VALUE seed could, in principle, reject lower scores).
+      double bestScore = Double.NEGATIVE_INFINITY;
+      for (Vector.Element element : pair.getSecond().get().all()) {
+        if (element.get() > bestScore) {
+          bestScore = element.get();
+          bestIdx = element.index();
+        }
+      }
+      if (bestIdx != Integer.MIN_VALUE) {
+        ClassifierResult classifierResult = new ClassifierResult(labelMap.get(bestIdx), bestScore);
+        analyzer.addInstance(pair.getFirst().toString(), classifierResult);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
new file mode 100644
index 0000000..2b8ee1e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ComplementaryThetaTrainer.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.classifier.naivebayes.ComplementaryNaiveBayesClassifier;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Accumulates the per-label theta normalizers used by Complementary Naive
+ * Bayes training.  See http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf,
+ * Section 3.2 ("Weight Magnitude Errors").  Not thread-safe.
+ */
+public class ComplementaryThetaTrainer {
+
+  // Aggregated weight of each feature, summed over all labels.
+  private final Vector weightsPerFeature;
+  // Aggregated weight of each label, summed over all features.
+  private final Vector weightsPerLabel;
+  // Running sum of |weight| per label, built up by train().
+  private final Vector perLabelThetaNormalizer;
+  // Laplace smoothing parameter.
+  private final double alphaI;
+  // Total weight over the whole corpus (zSum of the per-label weights).
+  private final double totalWeightSum;
+  // Count of features with non-default entries in weightsPerFeature.
+  private final double numFeatures;
+
+  public ComplementaryThetaTrainer(Vector weightsPerFeature, Vector weightsPerLabel, double alphaI) {
+    Preconditions.checkNotNull(weightsPerFeature);
+    Preconditions.checkNotNull(weightsPerLabel);
+    this.weightsPerFeature = weightsPerFeature;
+    this.weightsPerLabel = weightsPerLabel;
+    this.alphaI = alphaI;
+    perLabelThetaNormalizer = weightsPerLabel.like();
+    totalWeightSum = weightsPerLabel.zSum();
+    numFeatures = weightsPerFeature.getNumNondefaultElements();
+  }
+
+  /**
+   * Folds one label's weight vector into that label's theta normalizer.
+   *
+   * @param label          index of the label being trained
+   * @param perLabelWeight per-feature weights observed for that label
+   */
+  public void train(int label, Vector perLabelWeight) {
+    double labelWeight = labelWeight(label);
+    // sum weights for each label including those with zero word counts
+    for(int i = 0; i < perLabelWeight.size(); i++){
+      Vector.Element perLabelWeightElement = perLabelWeight.getElement(i);
+      updatePerLabelThetaNormalizer(label,
+          ComplementaryNaiveBayesClassifier.computeWeight(featureWeight(perLabelWeightElement.index()),
+              perLabelWeightElement.get(), totalWeightSum(), labelWeight, alphaI(), numFeatures()));
+    }
+  }
+
+  protected double alphaI() {
+    return alphaI;
+  }
+
+  protected double numFeatures() {
+    return numFeatures;
+  }
+
+  protected double labelWeight(int label) {
+    return weightsPerLabel.get(label);
+  }
+
+  protected double totalWeightSum() {
+    return totalWeightSum;
+  }
+
+  protected double featureWeight(int feature) {
+    return weightsPerFeature.get(feature);
+  }
+
+  // http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+  protected void updatePerLabelThetaNormalizer(int label, double weight) {
+    perLabelThetaNormalizer.set(label, perLabelThetaNormalizer.get(label) + Math.abs(weight));
+  }
+
+  /** @return a defensive copy of the accumulated per-label theta normalizer */
+  public Vector retrievePerLabelThetaNormalizer() {
+    return perLabelThetaNormalizer.clone();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
new file mode 100644
index 0000000..4df869e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/IndexInstancesMapper.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+/**
+ * Maps each labeled instance ("/<label>/..." text key) to its integer label
+ * index from the cached label index; instances whose label is unknown are
+ * counted as skipped instead of being emitted.
+ */
+public class IndexInstancesMapper extends Mapper<Text, VectorWritable, IntWritable, VectorWritable> {
+
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  enum Counter { SKIPPED_INSTANCES }
+
+  private OpenObjectIntHashMap<String> labelIndex;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    labelIndex = BayesUtils.readIndexFromCache(ctx.getConfiguration());
+  }
+
+  @Override
+  protected void map(Text labelText, VectorWritable instance, Context ctx) throws IOException, InterruptedException {
+    // The label is the second path segment of the key.
+    String label = SLASH.split(labelText.toString())[1];
+    if (!labelIndex.containsKey(label)) {
+      // Unknown label: count it and drop the instance.
+      ctx.getCounter(Counter.SKIPPED_INSTANCES).increment(1);
+      return;
+    }
+    ctx.write(new IntWritable(labelIndex.get(label)), instance);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
new file mode 100644
index 0000000..ff2ea40
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/ThetaMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Theta-normalizer pass of Naive Bayes training: feeds each summed per-label
+ * vector through a {@link ComplementaryThetaTrainer} and emits the accumulated
+ * per-label theta normalizer once, at cleanup time.
+ */
+public class ThetaMapper extends Mapper<IntWritable, VectorWritable, Text, VectorWritable> {
+
+  public static final String ALPHA_I = ThetaMapper.class.getName() + ".alphaI";
+  static final String TRAIN_COMPLEMENTARY = ThetaMapper.class.getName() + ".trainComplementary";
+
+  private ComplementaryThetaTrainer trainer;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    Configuration conf = ctx.getConfiguration();
+    // Per-feature and per-label weight vectors come from the distributed cache.
+    Map<String, Vector> scores = BayesUtils.readScoresFromCache(conf);
+    trainer = new ComplementaryThetaTrainer(
+        scores.get(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE),
+        scores.get(TrainNaiveBayesJob.WEIGHTS_PER_LABEL),
+        conf.getFloat(ALPHA_I, 1.0f));
+  }
+
+  @Override
+  protected void map(IntWritable key, VectorWritable value, Context ctx) throws IOException, InterruptedException {
+    // key is a label index; value is the summed weight vector for that label.
+    trainer.train(key.get(), value.get());
+  }
+
+  @Override
+  protected void cleanup(Context ctx) throws IOException, InterruptedException {
+    // Emit the accumulated normalizer exactly once per mapper.
+    ctx.write(new Text(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER),
+        new VectorWritable(trainer.retrievePerLabelThetaNormalizer()));
+    super.cleanup(ctx);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
new file mode 100644
index 0000000..cd18d28
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.mapreduce.VectorSumReducer;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.base.Splitter;
+
+/** Trains a Naive Bayes Classifier (parameters for both Naive Bayes and Complementary Naive Bayes) */
+public final class TrainNaiveBayesJob extends AbstractJob {
+  private static final String TRAIN_COMPLEMENTARY = "trainComplementary";
+  private static final String ALPHA_I = "alphaI";
+  private static final String LABEL_INDEX = "labelIndex";
+  // Keys under which the aggregated weight vectors are written / cached.
+  public static final String WEIGHTS_PER_FEATURE = "__SPF";
+  public static final String WEIGHTS_PER_LABEL = "__SPL";
+  public static final String LABEL_THETA_NORMALIZER = "_LTN";
+  // Temp-path subdirectory names for the three job stages.
+  public static final String SUMMED_OBSERVATIONS = "summedObservations";
+  public static final String WEIGHTS = "weights";
+  public static final String THETAS = "thetas";
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TrainNaiveBayesJob(), args);
+  }
+
+  /**
+   * Runs up to three chained MR jobs: (1) sum instance vectors per label,
+   * (2) sum weights per label and per feature, (3, complementary only) compute
+   * per-label theta normalizers; then validates and serializes the model.
+   *
+   * @return 0 on success, -1 on argument error or any failed job
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+
+    addInputOption();
+    addOutputOption();
+
+    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
+    addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
+    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    Map<String, List<String>> parsedArgs = parseArguments(args);
+    if (parsedArgs == null) {
+      return -1;
+    }
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), getOutputPath());
+      HadoopUtil.delete(getConf(), getTempPath());
+    }
+    // Label index goes to the user-supplied path, or to a temp path by default.
+    Path labPath;
+    String labPathStr = getOption(LABEL_INDEX);
+    if (labPathStr != null) {
+      labPath = new Path(labPathStr);
+    } else {
+      labPath = getTempPath(LABEL_INDEX);
+    }
+    long labelSize = createLabelIndex(labPath);
+    float alphaI = Float.parseFloat(getOption(ALPHA_I));
+    boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);
+
+    HadoopUtil.setSerializations(getConf());
+    // The label index must be in the distributed cache for IndexInstancesMapper.
+    HadoopUtil.cacheFiles(labPath, getConf());
+
+    // Add up all the vectors with the same labels, while mapping the labels into our index
+    Job indexInstances = prepareJob(getInputPath(),
+                                    getTempPath(SUMMED_OBSERVATIONS),
+                                    SequenceFileInputFormat.class,
+                                    IndexInstancesMapper.class,
+                                    IntWritable.class,
+                                    VectorWritable.class,
+                                    VectorSumReducer.class,
+                                    IntWritable.class,
+                                    VectorWritable.class,
+                                    SequenceFileOutputFormat.class);
+    indexInstances.setCombinerClass(VectorSumReducer.class);
+    boolean succeeded = indexInstances.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+    // Sum up all the weights from the previous step, per label and per feature
+    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+                                  getTempPath(WEIGHTS),
+                                  SequenceFileInputFormat.class,
+                                  WeightsMapper.class,
+                                  Text.class,
+                                  VectorWritable.class,
+                                  VectorSumReducer.class,
+                                  Text.class,
+                                  VectorWritable.class,
+                                  SequenceFileOutputFormat.class);
+    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
+    weightSummer.setCombinerClass(VectorSumReducer.class);
+    succeeded = weightSummer.waitForCompletion(true);
+    if (!succeeded) {
+      return -1;
+    }
+
+    // Put the per label and per feature vectors into the cache
+    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
+
+    if (trainComplementary){
+      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
+      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+      Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS),
+                                   getTempPath(THETAS),
+                                   SequenceFileInputFormat.class,
+                                   ThetaMapper.class,
+                                   Text.class,
+                                   VectorWritable.class,
+                                   VectorSumReducer.class,
+                                   Text.class,
+                                   VectorWritable.class,
+                                   SequenceFileOutputFormat.class);
+      thetaSummer.setCombinerClass(VectorSumReducer.class);
+      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
+      thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
+      succeeded = thetaSummer.waitForCompletion(true);
+      if (!succeeded) {
+        return -1;
+      }
+    }
+
+    // Put the per label theta normalizers into the cache
+    // NOTE(review): this runs even when trainComplementary is false, in which
+    // case the THETAS temp dir was never created -- confirm HadoopUtil.cacheFiles
+    // tolerates a non-existent path, or guard this with trainComplementary.
+    HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());
+
+    // Validate our model and then write it out to the official output
+    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
+    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
+    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
+    naiveBayesModel.validate();
+    naiveBayesModel.serialize(getOutputPath(), getConf());
+
+    return 0;
+  }
+
+  /**
+   * Scans the input and writes the label index to {@code labPath}.
+   *
+   * @return the number of distinct labels found
+   */
+  private long createLabelIndex(Path labPath) throws IOException {
+    long labelSize = 0;
+    Iterable<Pair<Text,IntWritable>> iterable =
+        new SequenceFileDirIterable<>(getInputPath(),
+                                      PathType.LIST,
+                                      PathFilters.logsCRCFilter(),
+                                      getConf());
+    labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
+    return labelSize;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
new file mode 100644
index 0000000..5563057
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/training/WeightsMapper.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes.training;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Weight-summing pass of Naive Bayes training: accumulates a per-feature
+ * weight vector and a per-label weight vector over all instances seen by this
+ * mapper, emitting both once at cleanup time.
+ */
+public class WeightsMapper extends Mapper<IntWritable, VectorWritable, Text, VectorWritable> {
+
+  static final String NUM_LABELS = WeightsMapper.class.getName() + ".numLabels";
+
+  private Vector weightsPerFeature;
+  private Vector weightsPerLabel;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    int numLabels = Integer.parseInt(ctx.getConfiguration().get(NUM_LABELS));
+    Preconditions.checkArgument(numLabels > 0, "Wrong numLabels: " + numLabels + ". Must be > 0!");
+    weightsPerLabel = new DenseVector(numLabels);
+  }
+
+  @Override
+  protected void map(IntWritable index, VectorWritable value, Context ctx) throws IOException, InterruptedException {
+    Vector instance = value.get();
+    // Lazily size the per-feature accumulator from the first instance seen.
+    if (weightsPerFeature == null) {
+      weightsPerFeature = new RandomAccessSparseVector(instance.size(), instance.getNumNondefaultElements());
+    }
+    weightsPerFeature.assign(instance, Functions.PLUS);
+    int label = index.get();
+    weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
+  }
+
+  @Override
+  protected void cleanup(Context ctx) throws IOException, InterruptedException {
+    // Nothing to emit if this mapper received no input at all.
+    if (weightsPerFeature != null) {
+      ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
+      ctx.write(new Text(TrainNaiveBayesJob.WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
+    }
+    super.cleanup(ctx);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
new file mode 100644
index 0000000..6d4e2b0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/BaumWelchTrainer.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.io.DataOutputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * A class for EM training of HMM from console
+ */
+/**
+ * Command-line entry point for Baum-Welch (EM) training of an HMM: parses options,
+ * reads an integer observation sequence from a file, trains a randomly initialized
+ * model, serializes the result, and prints the trained parameters to stdout.
+ */
+public final class BaumWelchTrainer {
+
+  private BaumWelchTrainer() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+
+    Option inputOption = DefaultOptionCreator.inputOption().create();
+
+    Option outputOption = DefaultOptionCreator.outputOption().create();
+
+    Option stateNumberOption = optionBuilder.withLongName("nrOfHiddenStates").
+      withDescription("Number of hidden states").
+      withShortName("nh").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option observedStateNumberOption = optionBuilder.withLongName("nrOfObservedStates").
+      withDescription("Number of observed states").
+      withShortName("no").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option epsilonOption = optionBuilder.withLongName("epsilon").
+      withDescription("Convergence threshold").
+      withShortName("e").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Option iterationsOption = optionBuilder.withLongName("max-iterations").
+      withDescription("Maximum iterations number").
+      withShortName("m").withArgument(argumentBuilder.withMaximum(1).withMinimum(1).
+      withName("number").create()).withRequired(true).create();
+
+    Group optionGroup = new GroupBuilder().withOption(inputOption).
+      withOption(outputOption).withOption(stateNumberOption).withOption(observedStateNumberOption).
+      withOption(epsilonOption).withOption(iterationsOption).
+      withName("Options").create();
+
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(optionGroup);
+      CommandLine commandLine = parser.parse(args);
+
+      String input = (String) commandLine.getValue(inputOption);
+      String output = (String) commandLine.getValue(outputOption);
+
+      int nrOfHiddenStates = Integer.parseInt((String) commandLine.getValue(stateNumberOption));
+      int nrOfObservedStates = Integer.parseInt((String) commandLine.getValue(observedStateNumberOption));
+
+      double epsilon = Double.parseDouble((String) commandLine.getValue(epsilonOption));
+      int maxIterations = Integer.parseInt((String) commandLine.getValue(iterationsOption));
+
+      //constructing random-generated HMM, seeded with the current time
+      HmmModel model = new HmmModel(nrOfHiddenStates, nrOfObservedStates, new Date().getTime());
+
+      //training
+      HmmModel trainedModel = HmmTrainer.trainBaumWelch(model,
+        readObservations(input), epsilon, maxIterations, true);
+
+      //serializing trained model
+      try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(output))) {
+        LossyHmmSerializer.serialize(trainedModel, stream);
+      }
+
+      //printing trained model
+      printModel(trainedModel);
+    } catch (OptionException e) {
+      CommandLineUtil.printHelp(optionGroup);
+    }
+  }
+
+  /**
+   * Reads a whitespace-separated sequence of integer observation ids from the given file.
+   *
+   * @param input path of the UTF-8 encoded observation file
+   * @return the observations as a primitive int array, in file order
+   * @throws IOException if the file cannot be opened
+   */
+  private static int[] readObservations(String input) throws IOException {
+    List<Integer> observations = new ArrayList<>();
+    try (Scanner scanner = new Scanner(new FileInputStream(input), "UTF-8")) {
+      // stops at the first token that is not an int (or at EOF)
+      while (scanner.hasNextInt()) {
+        observations.add(scanner.nextInt());
+      }
+    }
+    int[] observationsArray = new int[observations.size()];
+    for (int i = 0; i < observations.size(); ++i) {
+      observationsArray[i] = observations.get(i);
+    }
+    return observationsArray;
+  }
+
+  /**
+   * Prints the trained model's initial probabilities, transition matrix and
+   * emission matrix to stdout, each with a column-index header row.
+   *
+   * @param trainedModel model whose parameters are printed
+   */
+  private static void printModel(HmmModel trainedModel) {
+    System.out.println("Initial probabilities: ");
+    for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+      System.out.print(i + " ");
+    }
+    System.out.println();
+    for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+      System.out.print(trainedModel.getInitialProbabilities().get(i) + " ");
+    }
+    System.out.println();
+
+    System.out.println("Transition matrix:");
+    System.out.print(" ");
+    for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+      System.out.print(i + " ");
+    }
+    System.out.println();
+    for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+      System.out.print(i + " ");
+      for (int j = 0; j < trainedModel.getNrOfHiddenStates(); ++j) {
+        System.out.print(trainedModel.getTransitionMatrix().get(i, j) + " ");
+      }
+      System.out.println();
+    }
+    System.out.println("Emission matrix: ");
+    System.out.print(" ");
+    for (int i = 0; i < trainedModel.getNrOfOutputStates(); ++i) {
+      System.out.print(i + " ");
+    }
+    System.out.println();
+    for (int i = 0; i < trainedModel.getNrOfHiddenStates(); ++i) {
+      System.out.print(i + " ");
+      for (int j = 0; j < trainedModel.getNrOfOutputStates(); ++j) {
+        System.out.print(trainedModel.getEmissionMatrix().get(i, j) + " ");
+      }
+      System.out.println();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
new file mode 100644
index 0000000..c1d328e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmAlgorithms.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Class containing implementations of the three major HMM algorithms: forward,
+ * backward and Viterbi
+ */
+public final class HmmAlgorithms {
+
+
+  /**
+   * No public constructors for utility classes.
+   */
+  private HmmAlgorithms() {
+    // nothing to do here really
+  }
+
+  /**
+   * External function to compute a matrix of alpha factors
+   *
+   * @param model model to run forward algorithm for.
+   * @param observations observation sequence to train on.
+   * @param scaled Should log-scaled alpha factors be computed?
+   * @return matrix of alpha factors, one row per time step, one column per hidden state.
+   */
+  public static Matrix forwardAlgorithm(HmmModel model, int[] observations, boolean scaled) {
+    Matrix alpha = new DenseMatrix(observations.length, model.getNrOfHiddenStates());
+    forwardAlgorithm(alpha, model, observations, scaled);
+
+    return alpha;
+  }
+
+  /**
+   * Internal function to compute the alpha factors.
+   * alpha(t, i) is the (possibly log-scaled) probability of seeing the first
+   * t+1 observations and ending in hidden state i.
+   *
+   * @param alpha matrix to store alpha factors in.
+   * @param model model to use for alpha factor computation.
+   * @param observations observation sequence seen.
+   * @param scaled set to true if log-scaled alpha factors should be computed.
+   */
+  static void forwardAlgorithm(Matrix alpha, HmmModel model, int[] observations, boolean scaled) {
+
+    // fetch references to the model parameters
+    Vector ip = model.getInitialProbabilities();
+    Matrix b = model.getEmissionMatrix();
+    Matrix a = model.getTransitionMatrix();
+
+    if (scaled) { // compute log scaled alpha values
+      // Initialization: alpha(0, i) = log(ip(i) * b(i, o_0))
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        alpha.setQuick(0, i, Math.log(ip.getQuick(i) * b.getQuick(i, observations[0])));
+      }
+
+      // Induction
+      for (int t = 1; t < observations.length; t++) {
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          double sum = Double.NEGATIVE_INFINITY; // log(0)
+          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
+            double tmp = alpha.getQuick(t - 1, j) + Math.log(a.getQuick(j, i));
+            if (tmp > Double.NEGATIVE_INFINITY) {
+              // make sure we handle log(0) correctly
+              // log-sum-exp accumulation: sum := log(exp(sum) + exp(tmp));
+              // a tmp of -inf (zero probability) contributes nothing and is skipped
+              sum = tmp + Math.log1p(Math.exp(sum - tmp));
+            }
+          }
+          alpha.setQuick(t, i, sum + Math.log(b.getQuick(i, observations[t])));
+        }
+      }
+    } else {
+
+      // Initialization: alpha(0, i) = ip(i) * b(i, o_0)
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        alpha.setQuick(0, i, ip.getQuick(i) * b.getQuick(i, observations[0]));
+      }
+
+      // Induction: alpha(t, i) = [sum_j alpha(t-1, j) * a(j, i)] * b(i, o_t)
+      for (int t = 1; t < observations.length; t++) {
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          double sum = 0.0;
+          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
+            sum += alpha.getQuick(t - 1, j) * a.getQuick(j, i);
+          }
+          alpha.setQuick(t, i, sum * b.getQuick(i, observations[t]));
+        }
+      }
+    }
+  }
+
+  /**
+   * External function to compute a matrix of beta factors
+   *
+   * @param model model to use for estimation.
+   * @param observations observation sequence seen.
+   * @param scaled Set to true if log-scaled beta factors should be computed.
+   * @return beta factors based on the model and observation sequence.
+   */
+  public static Matrix backwardAlgorithm(HmmModel model, int[] observations, boolean scaled) {
+    // initialize the matrix
+    Matrix beta = new DenseMatrix(observations.length, model.getNrOfHiddenStates());
+    // compute the beta factors
+    backwardAlgorithm(beta, model, observations, scaled);
+
+    return beta;
+  }
+
+  /**
+   * Internal function to compute the beta factors.
+   * beta(t, i) is the (possibly log-scaled) probability of the observations
+   * after time t, given that the hidden state at time t is i.
+   *
+   * @param beta Matrix to store resulting factors in.
+   * @param model model to use for factor estimation.
+   * @param observations sequence of observations to estimate.
+   * @param scaled set to true to compute log-scaled parameters.
+   */
+  static void backwardAlgorithm(Matrix beta, HmmModel model, int[] observations, boolean scaled) {
+    // fetch references to the model parameters
+    Matrix b = model.getEmissionMatrix();
+    Matrix a = model.getTransitionMatrix();
+
+    if (scaled) { // compute log-scaled factors
+      // initialization: log(1) = 0 for the last time step
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        beta.setQuick(observations.length - 1, i, 0);
+      }
+
+      // induction, iterating backwards in time
+      for (int t = observations.length - 2; t >= 0; t--) {
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          double sum = Double.NEGATIVE_INFINITY; // log(0)
+          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
+            double tmp = beta.getQuick(t + 1, j) + Math.log(a.getQuick(i, j))
+                + Math.log(b.getQuick(j, observations[t + 1]));
+            if (tmp > Double.NEGATIVE_INFINITY) {
+              // handle log(0)
+              // log-sum-exp accumulation, as in the forward algorithm
+              sum = tmp + Math.log1p(Math.exp(sum - tmp));
+            }
+          }
+          beta.setQuick(t, i, sum);
+        }
+      }
+    } else {
+      // initialization: beta is 1 for every state at the last time step
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        beta.setQuick(observations.length - 1, i, 1);
+      }
+      // induction: beta(t, i) = sum_j beta(t+1, j) * a(i, j) * b(j, o_{t+1})
+      for (int t = observations.length - 2; t >= 0; t--) {
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          double sum = 0;
+          for (int j = 0; j < model.getNrOfHiddenStates(); j++) {
+            sum += beta.getQuick(t + 1, j) * a.getQuick(i, j) * b.getQuick(j, observations[t + 1]);
+          }
+          beta.setQuick(t, i, sum);
+        }
+      }
+    }
+  }
+
+  /**
+   * Viterbi algorithm to compute the most likely hidden sequence for a given
+   * model and observed sequence
+   *
+   * @param model HmmModel for which the Viterbi path should be computed
+   * @param observations Sequence of observations
+   * @param scaled Use log-scaled computations, this requires higher computational
+   *               effort but is numerically more stable for large observation
+   *               sequences
+   * @return nrOfObservations 1D int array containing the most likely hidden
+   *         sequence
+   */
+  public static int[] viterbiAlgorithm(HmmModel model, int[] observations, boolean scaled) {
+
+    // probability that the most probable hidden states ends at state i at
+    // time t
+    double[][] delta = new double[observations.length][model
+        .getNrOfHiddenStates()];
+
+    // previous hidden state in the most probable state leading up to state
+    // i at time t
+    int[][] phi = new int[observations.length - 1][model.getNrOfHiddenStates()];
+
+    // initialize the return array
+    int[] sequence = new int[observations.length];
+
+    viterbiAlgorithm(sequence, delta, phi, model, observations, scaled);
+
+    return sequence;
+  }
+
+  /**
+   * Internal version of the viterbi algorithm, allowing to reuse existing
+   * arrays instead of allocating new ones
+   *
+   * @param sequence NrOfObservations 1D int array for storing the viterbi sequence
+   * @param delta NrOfObservations x NrHiddenStates 2D double array for storing the
+   *              delta factors
+   * @param phi NrOfObservations-1 x NrHiddenStates 2D int array for storing the
+   *            phi values
+   * @param model HmmModel for which the viterbi path should be computed
+   * @param observations Sequence of observations
+   * @param scaled Use log-scaled computations, this requires higher computational
+   *               effort but is numerically more stable for large observation
+   *               sequences
+   */
+  static void viterbiAlgorithm(int[] sequence, double[][] delta, int[][] phi, HmmModel model, int[] observations,
+                               boolean scaled) {
+    // fetch references to the model parameters
+    Vector ip = model.getInitialProbabilities();
+    Matrix b = model.getEmissionMatrix();
+    Matrix a = model.getTransitionMatrix();
+
+    // Initialization: delta(0, i) = (log of) ip(i) * b(i, o_0)
+    if (scaled) {
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        delta[0][i] = Math.log(ip.getQuick(i) * b.getQuick(i, observations[0]));
+      }
+    } else {
+
+      for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+        delta[0][i] = ip.getQuick(i) * b.getQuick(i, observations[0]);
+      }
+    }
+
+    // Induction
+    // iterate over the time
+    if (scaled) {
+      for (int t = 1; t < observations.length; t++) {
+        // iterate over the hidden states
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          // find the maximum probability and most likely state
+          // leading up
+          // to this
+          int maxState = 0;
+          double maxProb = delta[t - 1][0] + Math.log(a.getQuick(0, i));
+          for (int j = 1; j < model.getNrOfHiddenStates(); j++) {
+            double prob = delta[t - 1][j] + Math.log(a.getQuick(j, i));
+            if (prob > maxProb) {
+              maxProb = prob;
+              maxState = j;
+            }
+          }
+          delta[t][i] = maxProb + Math.log(b.getQuick(i, observations[t]));
+          // phi(t-1, i) = best predecessor of state i at time t
+          phi[t - 1][i] = maxState;
+        }
+      }
+    } else {
+      for (int t = 1; t < observations.length; t++) {
+        // iterate over the hidden states
+        for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+          // find the maximum probability and most likely state
+          // leading up
+          // to this
+          int maxState = 0;
+          double maxProb = delta[t - 1][0] * a.getQuick(0, i);
+          for (int j = 1; j < model.getNrOfHiddenStates(); j++) {
+            double prob = delta[t - 1][j] * a.getQuick(j, i);
+            if (prob > maxProb) {
+              maxProb = prob;
+              maxState = j;
+            }
+          }
+          delta[t][i] = maxProb * b.getQuick(i, observations[t]);
+          phi[t - 1][i] = maxState;
+        }
+      }
+    }
+
+    // find the most likely end state for initialization
+    // (unscaled deltas are >= 0, so 0.0 works as the initial maximum;
+    // scaled deltas can be any negative value, hence -inf)
+    double maxProb;
+    if (scaled) {
+      maxProb = Double.NEGATIVE_INFINITY;
+    } else {
+      maxProb = 0.0;
+    }
+    for (int i = 0; i < model.getNrOfHiddenStates(); i++) {
+      if (delta[observations.length - 1][i] > maxProb) {
+        maxProb = delta[observations.length - 1][i];
+        sequence[observations.length - 1] = i;
+      }
+    }
+
+    // now backtrack through phi to find the most likely hidden sequence
+    for (int t = observations.length - 2; t >= 0; t--) {
+      sequence[t] = phi[t][sequence[t + 1]];
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
new file mode 100644
index 0000000..6e2def6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmEvaluator.java
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * The HMMEvaluator class offers several methods to evaluate an HMM Model. The
+ * following use-cases are covered: 1) Generate a sequence of output states from
+ * a given model (prediction). 2) Compute the likelihood that a given model
+ * generated a given sequence of output states (model likelihood). 3) Compute
+ * the most likely hidden sequence for a given model and a given observed
+ * sequence (decoding).
+ */
+public final class HmmEvaluator {
+
+  /** Utility class; not instantiable. */
+  private HmmEvaluator() {}
+
+  /**
+   * Generate a random output-state sequence of the given length from the model,
+   * using a default random number generator.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @return integer array of {@code steps} output state IDs drawn from the model
+   */
+  public static int[] predict(HmmModel model, int steps) {
+    return predict(model, steps, RandomUtils.getRandom());
+  }
+
+  /**
+   * Generate a random output-state sequence of the given length from the model,
+   * seeding the random number generator for reproducibility.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @param seed seed to use for the RNG
+   * @return integer array of {@code steps} output state IDs drawn from the model
+   */
+  public static int[] predict(HmmModel model, int steps, long seed) {
+    return predict(model, steps, RandomUtils.getRandom(seed));
+  }
+
+  /**
+   * Sample an output sequence from the model via inverse-transform sampling on
+   * the cumulative initial, transition and output distributions.
+   *
+   * @param model The Hidden Markov model used to generate the output sequence
+   * @param steps Size of the generated output sequence
+   * @param rand RNG to use
+   * @return integer array of {@code steps} output state IDs drawn from the model
+   */
+  private static int[] predict(HmmModel model, int steps, Random rand) {
+    // cumulative distributions used for inverse-transform sampling
+    Vector cumulativeInitial = HmmUtils.getCumulativeInitialProbabilities(model);
+    Matrix cumulativeTransitions = HmmUtils.getCumulativeTransitionMatrix(model);
+    Matrix cumulativeOutputs = HmmUtils.getCumulativeOutputMatrix(model);
+    int[] outputStates = new int[steps];
+
+    // draw the initial hidden state: first index whose cumulative mass reaches the draw
+    double draw = rand.nextDouble();
+    int hiddenState = 0;
+    while (cumulativeInitial.get(hiddenState) < draw) {
+      hiddenState++;
+    }
+
+    for (int step = 0; step < steps; ++step) {
+      // emit an output state from the current hidden state
+      draw = rand.nextDouble();
+      int outputState = 0;
+      while (cumulativeOutputs.get(hiddenState, outputState) < draw) {
+        outputState++;
+      }
+      outputStates[step] = outputState;
+      // then transition to the next hidden state
+      draw = rand.nextDouble();
+      int nextState = 0;
+      while (cumulativeTransitions.get(hiddenState, nextState) < draw) {
+        nextState++;
+      }
+      hiddenState = nextState;
+    }
+    return outputStates;
+  }
+
+  /**
+   * Returns the likelihood that a given output sequence was produced by the
+   * given model, by running the forward algorithm and summing the final alpha
+   * values.
+   *
+   * @param model Model to base the likelihood on.
+   * @param outputSequence Sequence to compute likelihood for.
+   * @param scaled Use log-scaled parameters for computation. This is computationally
+   *               more expensive, but offers better numerical stability for
+   *               long output sequences
+   * @return Likelihood that the given model produced the given sequence
+   */
+  public static double modelLikelihood(HmmModel model, int[] outputSequence, boolean scaled) {
+    return modelLikelihood(HmmAlgorithms.forwardAlgorithm(model, outputSequence, scaled), scaled);
+  }
+
+  /**
+   * Computes the model likelihood from a matrix of alpha values produced by
+   * the forward algorithm: the sum over the last row (the output sequence is
+   * already encoded in the alpha values and is not needed here).
+   *
+   * @param alpha Matrix of alpha values
+   * @param scaled Set to true if the alpha values are log-scaled.
+   * @return model likelihood.
+   */
+  public static double modelLikelihood(Matrix alpha, boolean scaled) {
+    int lastRow = alpha.numRows() - 1;
+    double likelihood = 0;
+    for (int i = 0; i < alpha.numCols(); ++i) {
+      double value = alpha.getQuick(lastRow, i);
+      // log-scaled alphas must be exponentiated back before summing
+      likelihood += scaled ? Math.exp(value) : value;
+    }
+    return likelihood;
+  }
+
+  /**
+   * Computes the likelihood that a given output sequence was produced by a
+   * given model, using beta values from the backward algorithm.
+   *
+   * @param model model to compute sequence likelihood for.
+   * @param outputSequence sequence to base computation on.
+   * @param beta beta parameters.
+   * @param scaled set to true if betas are log-scaled.
+   * @return likelihood of the outputSequence given the model.
+   */
+  public static double modelLikelihood(HmmModel model, int[] outputSequence, Matrix beta, boolean scaled) {
+    Matrix emissions = model.getEmissionMatrix();
+    Vector initial = model.getInitialProbabilities();
+    int firstOutput = outputSequence[0];
+    double likelihood = 0;
+    // likelihood = sum_i pi(i) * beta(0, i) * e(i, o_0)
+    for (int i = 0; i < model.getNrOfHiddenStates(); ++i) {
+      double betaValue = scaled ? Math.exp(beta.getQuick(0, i)) : beta.getQuick(0, i);
+      likelihood += initial.getQuick(i) * betaValue * emissions.getQuick(i, firstOutput);
+    }
+    return likelihood;
+  }
+
+  /**
+   * Returns the most likely sequence of hidden states for the given model and
+   * observation, by delegating to the Viterbi algorithm.
+   *
+   * @param model model to use for decoding.
+   * @param observations integer Array containing a sequence of observed state IDs
+   * @param scaled Use log-scaled computations, this requires higher computational
+   *               effort but is numerically more stable for large observation
+   *               sequences
+   * @return integer array containing the most likely sequence of hidden state
+   *         IDs
+   */
+  public static int[] decode(HmmModel model, int[] observations, boolean scaled) {
+    return HmmAlgorithms.viterbiAlgorithm(model, observations, scaled);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
new file mode 100644
index 0000000..bc24884
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/HmmModel.java
@@ -0,0 +1,383 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import java.util.Map;
+import java.util.Random;
+
+import com.google.common.collect.BiMap;
+import com.google.common.collect.HashBiMap;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Main class defining a Hidden Markov Model
+ */
+public class HmmModel implements Cloneable {
+
+ /** Bi-directional Map for storing the observed state names */
+ private BiMap<String,Integer> outputStateNames;
+
+ /** Bi-Directional Map for storing the hidden state names */
+ private BiMap<String,Integer> hiddenStateNames;
+
+  /** Number of hidden states */
+ private int nrOfHiddenStates;
+
+ /** Number of output states */
+ private int nrOfOutputStates;
+
+ /**
+ * Transition matrix containing the transition probabilities between hidden
+ * states. TransitionMatrix(i,j) is the probability that we change from hidden
+ * state i to hidden state j In general: P(h(t+1)=h_j | h(t) = h_i) =
+ * transitionMatrix(i,j) Since we have to make sure that each hidden state can
+ * be "left", the following normalization condition has to hold:
+ * sum(transitionMatrix(i,j),j=1..hiddenStates) = 1
+ */
+ private Matrix transitionMatrix;
+
+ /**
+ * Output matrix containing the probabilities that we observe a given output
+ * state given a hidden state. outputMatrix(i,j) is the probability that we
+ * observe output state j if we are in hidden state i Formally: P(o(t)=o_j |
+ * h(t)=h_i) = outputMatrix(i,j) Since we always have an observation for each
+ * hidden state, the following normalization condition has to hold:
+ * sum(outputMatrix(i,j),j=1..outputStates) = 1
+ */
+ private Matrix emissionMatrix;
+
+ /**
+ * Vector containing the initial hidden state probabilities. That is
+ * P(h(0)=h_i) = initialProbabilities(i). Since we are dealing with
+ * probabilities the following normalization condition has to hold:
+ * sum(initialProbabilities(i),i=1..hiddenStates) = 1
+ */
+ private Vector initialProbabilities;
+
+
+  /**
+   * Get a copy of this model.
+   * Copy-constructor style clone (does not call {@code super.clone()}): the
+   * matrices and the initial-probability vector are cloned, and the state-name
+   * bimaps, when present, are copied into fresh maps.
+   */
+  @Override
+  public HmmModel clone() {
+    HmmModel model = new HmmModel(transitionMatrix.clone(), emissionMatrix.clone(), initialProbabilities.clone());
+    if (hiddenStateNames != null) {
+      model.hiddenStateNames = HashBiMap.create(hiddenStateNames);
+    }
+    if (outputStateNames != null) {
+      model.outputStateNames = HashBiMap.create(outputStateNames);
+    }
+    return model;
+  }
+
+  /**
+   * Replace this model's entire state with that of the given model.
+   *
+   * @param model The HmmModel that will be assigned to this one
+   */
+  public void assign(HmmModel model) {
+    // scalar sizes are copied directly
+    this.nrOfHiddenStates = model.nrOfHiddenStates;
+    this.nrOfOutputStates = model.nrOfOutputStates;
+    // NOTE: the state-name bimaps are shared by reference, not copied
+    this.hiddenStateNames = model.hiddenStateNames;
+    this.outputStateNames = model.outputStateNames;
+    // matrices and vector are cloned so later mutation of one model
+    // does not leak into the other
+    this.transitionMatrix = model.transitionMatrix.clone();
+    this.emissionMatrix = model.emissionMatrix.clone();
+    this.initialProbabilities = model.initialProbabilities.clone();
+  }
+
+  /**
+   * Construct a valid random Hidden-Markov parameter set with the given number
+   * of hidden and output states using a given seed.
+   *
+   * @param nrOfHiddenStates Number of hidden states
+   * @param nrOfOutputStates Number of output states
+   * @param seed Seed for the random initialization; 0 is a sentinel that
+   *             selects RandomUtils.getRandom() with no explicit seed
+   *             (original doc said "the current time is used" — depends on
+   *             RandomUtils internals, TODO confirm)
+   */
+  public HmmModel(int nrOfHiddenStates, int nrOfOutputStates, long seed) {
+    this.nrOfHiddenStates = nrOfHiddenStates;
+    this.nrOfOutputStates = nrOfOutputStates;
+    this.transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
+    this.emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);
+    this.initialProbabilities = new DenseVector(nrOfHiddenStates);
+    // initialize a random, valid parameter set
+    initRandomParameters(seed);
+  }
+
+  /**
+   * Construct a valid random Hidden-Markov parameter set with the given number
+   * of hidden and output states. Delegates to the seeded constructor with
+   * seed 0 (the "no explicit seed" sentinel).
+   *
+   * @param nrOfHiddenStates Number of hidden states
+   * @param nrOfOutputStates Number of output states
+   */
+  public HmmModel(int nrOfHiddenStates, int nrOfOutputStates) {
+    this(nrOfHiddenStates, nrOfOutputStates, 0);
+  }
+
+  /**
+   * Generates a Hidden Markov model using the specified parameters.
+   * The hidden-state count is taken from the initial-probability vector and
+   * the output-state count from the emission matrix's column count.
+   *
+   * NOTE(review): despite the declared @throws below, this constructor performs
+   * no validation — dimensions and normalization of the given parameters are
+   * taken on trust; callers must pass a consistent parameter set.
+   *
+   * @param transitionMatrix transition probabilities.
+   * @param emissionMatrix emission probabilities.
+   * @param initialProbabilities initial start probabilities.
+   * @throws IllegalArgumentException If the given parameter set is invalid
+   */
+  public HmmModel(Matrix transitionMatrix, Matrix emissionMatrix, Vector initialProbabilities) {
+    this.nrOfHiddenStates = initialProbabilities.size();
+    this.nrOfOutputStates = emissionMatrix.numCols();
+    this.transitionMatrix = transitionMatrix;
+    this.emissionMatrix = emissionMatrix;
+    this.initialProbabilities = initialProbabilities;
+  }
+
+  /**
+   * Fill this model with a random but properly normalized parameter set:
+   * the initial-probability vector and every row of the transition and
+   * emission matrices each sum to 1.
+   *
+   * @param seed seed to use for Random initialization. Use 0 to use Java-built-in-version.
+   */
+  private void initRandomParameters(long seed) {
+    // pick the generator: 0 is the sentinel for "no explicit seed"
+    Random rand = seed == 0 ? RandomUtils.getRandom() : RandomUtils.getRandom(seed);
+
+    // draw raw weights for the initial distribution, then normalize to sum 1
+    double sum = 0;
+    for (int state = 0; state < nrOfHiddenStates; state++) {
+      double draw = rand.nextDouble();
+      initialProbabilities.set(state, draw);
+      sum += draw;
+    }
+    initialProbabilities = initialProbabilities.divide(sum);
+
+    // each row of the transition matrix is an independent distribution over
+    // the successor hidden states
+    double[] row = new double[nrOfHiddenStates];
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      sum = 0;
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        row[j] = rand.nextDouble();
+        sum += row[j];
+      }
+      // normalize the raw draws into probabilities
+      for (int j = 0; j < nrOfHiddenStates; j++) {
+        row[j] /= sum;
+      }
+      transitionMatrix.set(i, row);
+    }
+
+    // likewise for the emission matrix: one distribution over output states
+    // per hidden state
+    row = new double[nrOfOutputStates];
+    for (int i = 0; i < nrOfHiddenStates; i++) {
+      sum = 0;
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        row[j] = rand.nextDouble();
+        sum += row[j];
+      }
+      for (int j = 0; j < nrOfOutputStates; j++) {
+        row[j] /= sum;
+      }
+      emissionMatrix.set(i, row);
+    }
+  }
+
  /**
   * Returns the number of hidden states of this model.
   *
   * @return number of hidden states
   */
  public int getNrOfHiddenStates() {
    return nrOfHiddenStates;
  }
+
  /**
   * Returns the number of output (observable) states of this model.
   *
   * @return number of output states
   */
  public int getNrOfOutputStates() {
    return nrOfOutputStates;
  }
+
  /**
   * Returns the hidden-state transition matrix (not a copy — callers share
   * the model's internal reference).
   *
   * @return the model's transition matrix.
   */
  public Matrix getTransitionMatrix() {
    return transitionMatrix;
  }
+
  /**
   * Returns the output-state probability matrix (not a copy — callers share
   * the model's internal reference).
   *
   * @return the model's emission matrix.
   */
  public Matrix getEmissionMatrix() {
    return emissionMatrix;
  }
+
  /**
   * Returns the vector of initial hidden-state probabilities (not a copy —
   * callers share the model's internal reference).
   *
   * @return the model's initial probabilities.
   */
  public Vector getInitialProbabilities() {
    return initialProbabilities;
  }
+
  /**
   * Returns the hidden-state name map. May be null if no hidden-state names
   * were registered (see {@link #getHiddenStateName(int)}'s null handling).
   *
   * @return hidden state names, or null when none were registered.
   */
  public Map<String, Integer> getHiddenStateNames() {
    return hiddenStateNames;
  }
+
+ /**
+ * Register an array of hidden state Names. We assume that the state name at
+ * position i has the ID i
+ *
+ * @param stateNames names of hidden states.
+ */
+ public void registerHiddenStateNames(String[] stateNames) {
+ if (stateNames != null) {
+ hiddenStateNames = HashBiMap.create();
+ for (int i = 0; i < stateNames.length; ++i) {
+ hiddenStateNames.put(stateNames[i], i);
+ }
+ }
+ }
+
  /**
   * Register a map of hidden state Names/state IDs. A null argument is a
   * no-op; otherwise the map is copied into an internal bidirectional map.
   *
   * @param stateNames <String,Integer> Map that assigns each state name an integer ID
   */
  public void registerHiddenStateNames(Map<String, Integer> stateNames) {
    if (stateNames != null) {
      hiddenStateNames = HashBiMap.create(stateNames);
    }
  }
+
+ /**
+ * Lookup the name for the given hidden state ID
+ *
+ * @param id Integer id of the hidden state
+ * @return String containing the name for the given ID, null if this ID is not
+ * known or no hidden state names were specified
+ */
+ public String getHiddenStateName(int id) {
+ if (hiddenStateNames == null) {
+ return null;
+ }
+ return hiddenStateNames.inverse().get(id);
+ }
+
+ /**
+ * Lookup the ID for the given hidden state name
+ *
+ * @param name Name of the hidden state
+ * @return int containing the ID for the given name, -1 if this name is not
+ * known or no hidden state names were specified
+ */
+ public int getHiddenStateID(String name) {
+ if (hiddenStateNames == null) {
+ return -1;
+ }
+ Integer tmp = hiddenStateNames.get(name);
+ return tmp == null ? -1 : tmp;
+ }
+
  /**
   * Returns the output-state name map. May be null if no output-state names
   * were registered (see {@link #getOutputStateName(int)}'s null handling).
   *
   * @return names of output states, or null when none were registered.
   */
  public Map<String, Integer> getOutputStateNames() {
    return outputStateNames;
  }
+
  /**
   * Register an array of output state Names. We assume that the state name at
   * position i has the ID i
   *
   * @param stateNames state names to register.
   */
  public void registerOutputStateNames(String[] stateNames) {
    if (stateNames != null) {
      outputStateNames = HashBiMap.create();
      for (int i = 0; i < stateNames.length; ++i) {
        outputStateNames.put(stateNames[i], i);
      }
    }
  }
+
  /**
   * Register a map of output state Names/state IDs. A null argument is a
   * no-op; otherwise the map is copied into an internal bidirectional map.
   *
   * @param stateNames <String,Integer> Map that assigns each state name an integer ID
   */
  public void registerOutputStateNames(Map<String, Integer> stateNames) {
    if (stateNames != null) {
      outputStateNames = HashBiMap.create(stateNames);
    }
  }
+
+ /**
+ * Lookup the name for the given output state id
+ *
+ * @param id Integer id of the output state
+ * @return String containing the name for the given id, null if this id is not
+ * known or no output state names were specified
+ */
+ public String getOutputStateName(int id) {
+ if (outputStateNames == null) {
+ return null;
+ }
+ return outputStateNames.inverse().get(id);
+ }
+
+ /**
+ * Lookup the ID for the given output state name
+ *
+ * @param name Name of the output state
+ * @return int containing the ID for the given name, -1 if this name is not
+ * known or no output state names were specified
+ */
+ public int getOutputStateID(String name) {
+ if (outputStateNames == null) {
+ return -1;
+ }
+ Integer tmp = outputStateNames.get(name);
+ return tmp == null ? -1 : tmp;
+ }
+
+}
r***@apache.org
2018-06-28 14:54:33 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
new file mode 100644
index 0000000..56cb237
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/UnitVectorizerJob.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * <p>Given a DistributedRowMatrix, this job normalizes each row to unit
+ * vector length. If the input is a matrix U, and the output is a matrix
+ * W, the job follows:</p>
+ *
+ * <p>{@code v_ij = u_ij / sqrt(sum_j(u_ij * u_ij))}</p>
+ */
+public final class UnitVectorizerJob {
+
+ private UnitVectorizerJob() {
+ }
+
+ public static void runJob(Path input, Path output)
+ throws IOException, InterruptedException, ClassNotFoundException {
+
+ Configuration conf = new Configuration();
+ Job job = new Job(conf, "UnitVectorizerJob");
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(UnitVectorizerMapper.class);
+ job.setNumReduceTasks(0);
+
+ FileInputFormat.addInputPath(job, input);
+ FileOutputFormat.setOutputPath(job, output);
+
+ job.setJarByClass(UnitVectorizerJob.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+
+ public static class UnitVectorizerMapper
+ extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+ @Override
+ protected void map(IntWritable row, VectorWritable vector, Context context)
+ throws IOException, InterruptedException {
+ context.write(row, new VectorWritable(vector.get().normalize(2)));
+ }
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
new file mode 100644
index 0000000..4ec8149
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorCache.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * This class handles reading and writing vectors to the Hadoop
+ * distributed cache. Created as a result of Eigencuts' liberal use
+ * of such functionality, but available to any algorithm requiring it.
+ */
+public final class VectorCache {
+
+ private static final Logger log = LoggerFactory.getLogger(VectorCache.class);
+
+ private VectorCache() {
+ }
+
+ /**
+ * @param key SequenceFile key
+ * @param vector Vector to save, to be wrapped as VectorWritable
+ */
+ public static void save(Writable key,
+ Vector vector,
+ Path output,
+ Configuration conf,
+ boolean overwritePath,
+ boolean deleteOnExit) throws IOException {
+
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ output = fs.makeQualified(output);
+ if (overwritePath) {
+ HadoopUtil.delete(conf, output);
+ }
+
+ // set the cache
+ DistributedCache.setCacheFiles(new URI[]{output.toUri()}, conf);
+
+ // set up the writer
+ try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output,
+ IntWritable.class, VectorWritable.class)){
+ writer.append(key, new VectorWritable(vector));
+ }
+
+ if (deleteOnExit) {
+ fs.deleteOnExit(output);
+ }
+ }
+
+ /**
+ * Calls the save() method, setting the cache to overwrite any previous
+ * Path and to delete the path after exiting
+ */
+ public static void save(Writable key, Vector vector, Path output, Configuration conf) throws IOException {
+ save(key, vector, output, conf, true, true);
+ }
+
+ /**
+ * Loads the vector from {@link DistributedCache}. Returns null if no vector exists.
+ */
+ public static Vector load(Configuration conf) throws IOException {
+ Path[] files = HadoopUtil.getCachedFiles(conf);
+
+ if (files.length != 1) {
+ throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ')');
+ }
+
+ if (log.isInfoEnabled()) {
+ log.info("Files are: {}", Arrays.toString(files));
+ }
+ return load(conf, files[0]);
+ }
+
+ /**
+ * Loads a Vector from the specified path. Returns null if no vector exists.
+ */
+ public static Vector load(Configuration conf, Path input) throws IOException {
+ log.info("Loading vector from: {}", input);
+ try (SequenceFileValueIterator<VectorWritable> iterator =
+ new SequenceFileValueIterator<>(input, true, conf)){
+ return iterator.next().get();
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
new file mode 100644
index 0000000..c42ab70
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VectorMatrixMultiplicationJob.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+
+/**
+ * <p>This class handles the three-way multiplication of the digonal matrix
+ * and the Markov transition matrix inherent in the Eigencuts algorithm.
+ * The equation takes the form:</p>
+ *
+ * {@code W = D^(1/2) * M * D^(1/2)}
+ *
+ * <p>Since the diagonal matrix D has only n non-zero elements, it is represented
+ * as a dense vector in this job, rather than a full n-by-n matrix. This job
+ * performs the multiplications and returns the new DRM.
+ */
+public final class VectorMatrixMultiplicationJob {
+
+ private VectorMatrixMultiplicationJob() {
+ }
+
+ /**
+ * Invokes the job.
+ * @param markovPath Path to the markov DRM's sequence files
+ */
+ public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath)
+ throws IOException, ClassNotFoundException, InterruptedException {
+
+ return runJob(markovPath, diag, outputPath, new Path(outputPath, "tmp"));
+ }
+
+ public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
+ throws IOException, ClassNotFoundException, InterruptedException {
+
+ // set up the serialization of the diagonal vector
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
+ markovPath = fs.makeQualified(markovPath);
+ outputPath = fs.makeQualified(outputPath);
+ Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
+ VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);
+
+ // set up the job itself
+ Job job = new Job(conf, "VectorMatrixMultiplication");
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(VectorWritable.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(VectorMatrixMultiplicationMapper.class);
+ job.setNumReduceTasks(0);
+
+ FileInputFormat.addInputPath(job, markovPath);
+ FileOutputFormat.setOutputPath(job, outputPath);
+
+ job.setJarByClass(VectorMatrixMultiplicationJob.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+
+ // build the resulting DRM from the results
+ return new DistributedRowMatrix(outputPath, tmpPath,
+ diag.size(), diag.size());
+ }
+
+ public static class VectorMatrixMultiplicationMapper
+ extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+ private Vector diagonal;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ // read in the diagonal vector from the distributed cache
+ super.setup(context);
+ Configuration config = context.getConfiguration();
+ diagonal = VectorCache.load(config);
+ if (diagonal == null) {
+ throw new IOException("No vector loaded from cache!");
+ }
+ if (!(diagonal instanceof DenseVector)) {
+ diagonal = new DenseVector(diagonal);
+ }
+ }
+
+ @Override
+ protected void map(IntWritable key, VectorWritable row, Context ctx)
+ throws IOException, InterruptedException {
+
+ for (Vector.Element e : row.get().all()) {
+ double dii = Functions.SQRT.apply(diagonal.get(key.get()));
+ double djj = Functions.SQRT.apply(diagonal.get(e.index()));
+ double mij = e.get();
+ e.set(dii * mij * djj);
+ }
+ ctx.write(key, row);
+ }
+
+ /**
+ * Performs the setup of the Mapper. Used by unit tests.
+ * @param diag
+ */
+ void setup(Vector diag) {
+ this.diagonal = diag;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
new file mode 100644
index 0000000..0d70cac
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/VertexWritable.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Represents a vertex within the affinity graph for Eigencuts.
+ */
+public class VertexWritable implements Writable {
+
+ /** the row */
+ private int i;
+
+ /** the column */
+ private int j;
+
+ /** the value at this vertex */
+ private double value;
+
+ /** an extra type delimeter, can probably be null */
+ private String type;
+
+ public VertexWritable() {
+ }
+
+ public VertexWritable(int i, int j, double v, String t) {
+ this.i = i;
+ this.j = j;
+ this.value = v;
+ this.type = t;
+ }
+
+ public int getRow() {
+ return i;
+ }
+
+ public void setRow(int i) {
+ this.i = i;
+ }
+
+ public int getCol() {
+ return j;
+ }
+
+ public void setCol(int j) {
+ this.j = j;
+ }
+
+ public double getValue() {
+ return value;
+ }
+
+ public void setValue(double v) {
+ this.value = v;
+ }
+
+ public String getType() {
+ return type;
+ }
+
+ public void setType(String t) {
+ this.type = t;
+ }
+
+ @Override
+ public void readFields(DataInput arg0) throws IOException {
+ this.i = arg0.readInt();
+ this.j = arg0.readInt();
+ this.value = arg0.readDouble();
+ this.type = arg0.readUTF();
+ }
+
+ @Override
+ public void write(DataOutput arg0) throws IOException {
+ arg0.writeInt(i);
+ arg0.writeInt(j);
+ arg0.writeDouble(value);
+ arg0.writeUTF(type);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
new file mode 100644
index 0000000..3ce94dc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/EigenSeedGenerator.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral.kmeans;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, select k vectors and write them to the
+ * output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the initial centroid to use. The
+ * selection criterion is the rows with max value in that respective column
+ */
+public final class EigenSeedGenerator {
+
+ private static final Logger log = LoggerFactory.getLogger(EigenSeedGenerator.class);
+
+ public static final String K = "k";
+
+ private EigenSeedGenerator() {}
+
+ public static Path buildFromEigens(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
+ throws IOException {
+ // delete the output directory
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ HadoopUtil.delete(conf, output);
+ Path outFile = new Path(output, "part-eigenSeed");
+ boolean newFile = fs.createNewFile(outFile);
+ if (newFile) {
+ Path inputPathPattern;
+
+ if (fs.getFileStatus(input).isDir()) {
+ inputPathPattern = new Path(input, "*");
+ } else {
+ inputPathPattern = input;
+ }
+
+ FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
+ Map<Integer,Double> maxEigens = new HashMap<>(k); // store
+ // max
+ // value
+ // of
+ // each
+ // column
+ Map<Integer,Text> chosenTexts = new HashMap<>(k);
+ Map<Integer,ClusterWritable> chosenClusters = new HashMap<>(k);
+
+ for (FileStatus fileStatus : inputFiles) {
+ if (!fileStatus.isDir()) {
+ for (Pair<Writable,VectorWritable> record : new SequenceFileIterable<Writable,VectorWritable>(
+ fileStatus.getPath(), true, conf)) {
+ Writable key = record.getFirst();
+ VectorWritable value = record.getSecond();
+
+ for (Vector.Element e : value.get().nonZeroes()) {
+ int index = e.index();
+ double v = Math.abs(e.get());
+
+ if (!maxEigens.containsKey(index) || v > maxEigens.get(index)) {
+ maxEigens.put(index, v);
+ Text newText = new Text(key.toString());
+ chosenTexts.put(index, newText);
+ Kluster newCluster = new Kluster(value.get(), index, measure);
+ newCluster.observe(value.get(), 1);
+ ClusterWritable clusterWritable = new ClusterWritable();
+ clusterWritable.setValue(newCluster);
+ chosenClusters.put(index, clusterWritable);
+ }
+ }
+ }
+ }
+ }
+
+ try (SequenceFile.Writer writer =
+ SequenceFile.createWriter(fs, conf, outFile, Text.class, ClusterWritable.class)){
+ for (Integer key : maxEigens.keySet()) {
+ writer.append(chosenTexts.get(key), chosenClusters.get(key));
+ }
+ log.info("EigenSeedGenerator:: Wrote {} Klusters to {}", chosenTexts.size(), outFile);
+ }
+ }
+
+ return outFile;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
new file mode 100644
index 0000000..427de91
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
@@ -0,0 +1,243 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.spectral.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.spectral.AffinityMatrixInputJob;
+import org.apache.mahout.clustering.spectral.MatrixDiagonalizeJob;
+import org.apache.mahout.clustering.spectral.UnitVectorizerJob;
+import org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.hadoop.DistributedRowMatrix;
+import org.apache.mahout.math.hadoop.stochasticsvd.SSVDSolver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Performs spectral k-means clustering on the top k eigenvectors of the input affinity matrix.
+ */
+public class SpectralKMeansDriver extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(SpectralKMeansDriver.class);
+
+  // Default SSVD tuning parameters, used when the caller does not override them.
+  public static final int REDUCERS = 10;
+  public static final int BLOCKHEIGHT = 30000;
+  public static final int OVERSAMPLING = 15;
+  public static final int POWERITERS = 0;
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new SpectralKMeansDriver(), args);
+  }
+
+  @Override
+  public int run(String[] arg0) throws Exception {
+
+    Configuration conf = getConf();
+    addInputOption();
+    addOutputOption();
+    addOption("dimensions", "d", "Square dimensions of affinity matrix", true);
+    addOption("clusters", "k", "Number of clusters and top eigenvectors", true);
+    addOption(DefaultOptionCreator.distanceMeasureOption().create());
+    addOption(DefaultOptionCreator.convergenceOption().create());
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addFlag("usessvd", "ssvd", "Uses SSVD as the eigensolver. Default is the Lanczos solver.");
+    addOption("reduceTasks", "t", "Number of reducers for SSVD", String.valueOf(REDUCERS));
+    addOption("outerProdBlockHeight", "oh", "Block height of outer products for SSVD", String.valueOf(BLOCKHEIGHT));
+    addOption("oversampling", "p", "Oversampling parameter for SSVD", String.valueOf(OVERSAMPLING));
+    addOption("powerIter", "q", "Additional power iterations for SSVD", String.valueOf(POWERITERS));
+
+    Map<String, List<String>> parsedArgs = parseArguments(arg0);
+    if (parsedArgs == null) {
+      // Argument parsing failed (help requested or bad input); nothing to do.
+      return 0;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(conf, getTempPath());
+      HadoopUtil.delete(conf, getOutputPath());
+    }
+    int numDims = Integer.parseInt(getOption("dimensions"));
+    int clusters = Integer.parseInt(getOption("clusters"));
+    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+
+    Path tempdir = new Path(getOption("tempDir"));
+    int reducers = Integer.parseInt(getOption("reduceTasks"));
+    int blockheight = Integer.parseInt(getOption("outerProdBlockHeight"));
+    int oversampling = Integer.parseInt(getOption("oversampling"));
+    int poweriters = Integer.parseInt(getOption("powerIter"));
+    run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations, tempdir, reducers,
+        blockheight, oversampling, poweriters);
+
+    return 0;
+  }
+
+  /**
+   * Convenience overload that runs the clustering with the default SSVD parameters
+   * ({@link #REDUCERS}, {@link #BLOCKHEIGHT}, {@link #OVERSAMPLING}, {@link #POWERITERS}).
+   */
+  public static void run(Configuration conf, Path input, Path output, int numDims, int clusters,
+      DistanceMeasure measure, double convergenceDelta, int maxIterations, Path tempDir)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    run(conf, input, output, numDims, clusters, measure, convergenceDelta, maxIterations, tempDir, REDUCERS,
+        BLOCKHEIGHT, OVERSAMPLING, POWERITERS);
+  }
+
+  /**
+   * Run the Spectral KMeans clustering on the supplied arguments
+   *
+   * @param conf
+   *          the Configuration to be used
+   * @param input
+   *          the Path to the input tuples directory
+   * @param output
+   *          the Path to the output directory
+   * @param numDims
+   *          the int number of dimensions of the affinity matrix
+   * @param clusters
+   *          the int number of eigenvectors and thus clusters to produce
+   * @param measure
+   *          the DistanceMeasure for the k-Means calculations
+   * @param convergenceDelta
+   *          the double convergence delta for the k-Means calculations
+   * @param maxIterations
+   *          the int maximum number of iterations for the k-Means calculations
+   * @param tempDir
+   *          Temporary directory for intermediate calculations
+   * @param numReducers
+   *          Number of reducers
+   * @param blockHeight
+   *          Block height of outer products for SSVD
+   * @param oversampling
+   *          Oversampling parameter for SSVD
+   * @param poweriters
+   *          Number of additional power iterations for SSVD
+   */
+  public static void run(Configuration conf, Path input, Path output, int numDims, int clusters,
+      DistanceMeasure measure, double convergenceDelta, int maxIterations, Path tempDir,
+      int numReducers, int blockHeight, int oversampling, int poweriters)
+    throws IOException, InterruptedException, ClassNotFoundException {
+
+    HadoopUtil.delete(conf, tempDir);
+    Path outputCalc = new Path(tempDir, "calculations");
+    Path outputTmp = new Path(tempDir, "temporary");
+
+    // Take in the raw CSV text file and split it ourselves,
+    // creating our own SequenceFiles for the matrices to read later
+    // (similar to the style of syntheticcontrol.canopy.InputMapper)
+    Path affSeqFiles = new Path(outputCalc, "seqfile");
+    AffinityMatrixInputJob.runJob(input, affSeqFiles, numDims, numDims);
+
+    // Construct the affinity matrix using the newly-created sequence files
+    DistributedRowMatrix A = new DistributedRowMatrix(affSeqFiles, new Path(outputTmp, "afftmp"), numDims, numDims);
+
+    Configuration depConf = new Configuration(conf);
+    A.setConf(depConf);
+
+    // Construct the diagonal matrix D (represented as a vector)
+    Vector D = MatrixDiagonalizeJob.runJob(affSeqFiles, numDims);
+
+    // Calculate the normalized Laplacian of the form: L = D^(-0.5)AD^(-0.5)
+    // NOTE(review): 'new Path(outputCalc, outputCalc)' nests outputCalc under itself as the
+    // job's temp path; this looks unintentional -- confirm whether a plain sub-directory
+    // name was meant here.
+    DistributedRowMatrix L = VectorMatrixMultiplicationJob.runJob(affSeqFiles, D, new Path(outputCalc, "laplacian"),
+        new Path(outputCalc, outputCalc));
+    L.setConf(depConf);
+
+    Path data;
+
+    // SSVD requires an array of Paths to function. So we pass in an array of length one
+    Path[] LPath = new Path[1];
+    LPath[0] = L.getRowPath();
+
+    Path SSVDout = new Path(outputCalc, "SSVD");
+
+    SSVDSolver solveIt = new SSVDSolver(depConf, LPath, SSVDout, blockHeight, clusters, oversampling, numReducers);
+
+    solveIt.setComputeV(false);
+    solveIt.setComputeU(true);
+    solveIt.setOverwrite(true);
+    solveIt.setQ(poweriters);
+    // solveIt.setBroadcast(false);
+    solveIt.run();
+    data = new Path(solveIt.getUPath());
+
+    // Normalize the rows of Wt to unit length
+    // normalize is important because it reduces the occurrence of two unique clusters combining into one
+    Path unitVectors = new Path(outputCalc, "unitvectors");
+
+    UnitVectorizerJob.runJob(data, unitVectors);
+
+    DistributedRowMatrix Wt = new DistributedRowMatrix(unitVectors, new Path(unitVectors, "tmp"), clusters, numDims);
+    Wt.setConf(depConf);
+    data = Wt.getRowPath();
+
+    // Generate initial clusters using EigenSeedGenerator which picks rows as centroids if that row contains max
+    // eigen value in that column
+    Path initialclusters = EigenSeedGenerator.buildFromEigens(conf, data,
+        new Path(output, Cluster.INITIAL_CLUSTERS_DIR), clusters, measure);
+
+    // Run the KMeansDriver
+    Path answer = new Path(output, "kmeans_out");
+    KMeansDriver.run(conf, data, initialclusters, answer, convergenceDelta, maxIterations, true, 0.0, false);
+
+    // Restore name to id mapping and read through the cluster assignments
+    Path mappingPath = new Path(new Path(conf.get("hadoop.tmp.dir")), "generic_input_mapping");
+    List<String> mapping = new ArrayList<>();
+    FileSystem fs = FileSystem.get(mappingPath.toUri(), conf);
+    if (fs.exists(mappingPath)) {
+      // try-with-resources ensures the reader is closed even if next() throws;
+      // the previous code leaked the reader.
+      try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, mappingPath, conf)) {
+        Text mappingValue = new Text();
+        IntWritable mappingIndex = new IntWritable();
+        while (reader.next(mappingIndex, mappingValue)) {
+          mapping.add(mappingValue.toString());
+        }
+      }
+      HadoopUtil.delete(conf, mappingPath);
+    } else {
+      log.warn("generic input mapping file not found!");
+    }
+
+    Path clusteredPointsPath = new Path(answer, "clusteredPoints");
+    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
+    int id = 0;
+    // Log each point's cluster assignment, using the restored name mapping when available.
+    for (Pair<IntWritable, WeightedVectorWritable> record :
+        new SequenceFileIterable<IntWritable, WeightedVectorWritable>(inputPath, conf)) {
+      if (!mapping.isEmpty()) {
+        log.info("{}: {}", mapping.get(id++), record.getFirst().get());
+      } else {
+        log.info("{}: {}", id++, record.getFirst().get());
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
new file mode 100644
index 0000000..25806fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/BallKMeans.java
@@ -0,0 +1,456 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.cluster;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.WeightedVector;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.Multinomial;
+import org.apache.mahout.math.random.WeightedThing;
+
+/**
+ * Implements a ball k-means algorithm for weighted vectors with probabilistic seeding similar to k-means++.
+ * The idea is that k-means++ gives good starting clusters and ball k-means can tune up the final result very nicely
+ * in only a few passes (or even in a single iteration for well-clusterable data).
+ *
+ * A good reference for this class of algorithms is "The Effectiveness of Lloyd-Type Methods for the k-Means Problem"
+ * by Rafail Ostrovsky, Yuval Rabani, Leonard J. Schulman and Chaitanya Swamy. The code here uses the seeding strategy
+ * as described in section 4.1.1 of that paper and the ball k-means step as described in section 4.2. We support
+ * multiple iterations in contrast to the algorithm described in the paper.
+ */
+public class BallKMeans implements Iterable<Centroid> {
+  /**
+   * The searcher containing the centroids.
+   */
+  private final UpdatableSearcher centroids;
+
+  /**
+   * The number of clusters to cluster the data into.
+   */
+  private final int numClusters;
+
+  /**
+   * The maximum number of iterations of the algorithm to run waiting for the cluster assignments
+   * to stabilize. If there are no changes in cluster assignment earlier, we can finish early.
+   */
+  private final int maxNumIterations;
+
+  /**
+   * When deciding which points to include in the new centroid calculation,
+   * it's preferable to exclude outliers since it increases the rate of convergence.
+   * So, we calculate the distance from each cluster to its closest neighboring cluster. When
+   * evaluating the points assigned to a cluster, we compare the distance between the centroid to
+   * the point with the distance between the centroid and its closest centroid neighbor
+   * multiplied by this trimFraction. If the distance between the centroid and the point is
+   * greater, we consider it an outlier and we don't use it.
+   */
+  private final double trimFraction;
+
+  /**
+   * Selecting the initial centroids is the most important part of the ball k-means clustering. Poor choices, like two
+   * centroids in the same actual cluster result in a low-quality final result.
+   * k-means++ initialization yields good quality clusters, especially when using BallKMeans after StreamingKMeans as
+   * the points have weights.
+   * Simple, random selection of the points based on their weights is faster but sometimes fails to produce the
+   * desired number of clusters.
+   * This field is true if the initialization should be done with k-means++.
+   */
+  private final boolean kMeansPlusPlusInit;
+
+  /**
+   * When using trimFraction, the weight of each centroid will not be the sum of the weights of
+   * the vectors assigned to that cluster because outliers are not used to compute the updated
+   * centroid.
+   * So, the total weight is probably wrong. This can be fixed by doing another pass over the
+   * data points and adjusting the weights of each centroid. This doesn't update the coordinates
+   * of the centroids, but is useful if the weights matter.
+   */
+  private final boolean correctWeights;
+
+  /**
+   * When running multiple ball k-means passes to get the one with the smallest total cost, can compute the
+   * overall cost, using all the points for clustering, or reserve a fraction of them, testProbability in a test set.
+   * The cost is the sum of the distances between each point and its corresponding centroid.
+   * We then use this set of points to compute the total cost on. We're therefore trying to select the clustering
+   * that best describes the underlying distribution of the clusters.
+   * This field is the probability of assigning a given point to the test set. If this is 0, the cost will be computed
+   * on the entire set of points.
+   */
+  private final double testProbability;
+
+  /**
+   * Whether or not testProbability > 0, i.e., there exists a non-empty 'test' set.
+   */
+  private final boolean splitTrainTest;
+
+  /**
+   * How many k-means runs to have. If there's more than one run, we compute the cost of each clustering as described
+   * above and select the clustering that minimizes the cost.
+   * Multiple runs are a lot more useful when using the random initialization. With kmeans++, 1-2 runs are enough and
+   * more runs are not likely to help quality much.
+   */
+  private final int numRuns;
+
+  /**
+   * Random object to sample values from.
+   */
+  private final Random random;
+
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations) {
+    // By default, the trimFraction is 0.9, k-means++ is used, the weights will be corrected at the end,
+    // there will be 0 points in the test set and 1 run.
+    this(searcher, numClusters, maxNumIterations, 0.9, true, true, 0.0, 1);
+  }
+
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations,
+                    boolean kMeansPlusPlusInit, int numRuns) {
+    // By default, the trimFraction is 0.9, the weights will be corrected at the end,
+    // and 10% of the points will be reserved for the test set.
+    this(searcher, numClusters, maxNumIterations, 0.9, kMeansPlusPlusInit, true, 0.1, numRuns);
+  }
+
+  /**
+   * Fully-parameterized constructor; the other constructors delegate here with defaults.
+   * All arguments are validated up front so invalid configurations fail fast.
+   */
+  public BallKMeans(UpdatableSearcher searcher, int numClusters, int maxNumIterations,
+                    double trimFraction, boolean kMeansPlusPlusInit, boolean correctWeights,
+                    double testProbability, int numRuns) {
+    Preconditions.checkArgument(searcher.size() == 0, "Searcher must be empty initially to populate with centroids");
+    Preconditions.checkArgument(numClusters > 0, "The requested number of clusters must be positive");
+    Preconditions.checkArgument(maxNumIterations > 0, "The maximum number of iterations must be positive");
+    Preconditions.checkArgument(trimFraction > 0, "The trim fraction must be positive");
+    Preconditions.checkArgument(testProbability >= 0 && testProbability < 1, "The testProbability must be in [0, 1)");
+    Preconditions.checkArgument(numRuns > 0, "There has to be at least one run");
+
+    this.centroids = searcher;
+    this.numClusters = numClusters;
+    this.maxNumIterations = maxNumIterations;
+
+    this.trimFraction = trimFraction;
+    this.kMeansPlusPlusInit = kMeansPlusPlusInit;
+    this.correctWeights = correctWeights;
+
+    this.testProbability = testProbability;
+    this.splitTrainTest = testProbability > 0;
+    this.numRuns = numRuns;
+
+    this.random = RandomUtils.getRandom();
+  }
+
+  /**
+   * Splits the datapoints into a (train, test) pair. The test set receives approximately
+   * testProbability * datapoints.size() points; the rest are the training set.
+   * Note: this shuffles the supplied list in place when testProbability > 0.
+   */
+  public Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> splitTrainTest(
+      List<? extends WeightedVector> datapoints) {
+    // If there will be no points assigned to the test set, return now.
+    if (testProbability == 0) {
+      return new Pair<List<? extends WeightedVector>, List<? extends WeightedVector>>(datapoints,
+          new ArrayList<WeightedVector>());
+    }
+
+    int numTest = (int) (testProbability * datapoints.size());
+    Preconditions.checkArgument(numTest > 0 && numTest < datapoints.size(),
+        "Must have nonzero number of training and test vectors. Asked for %.1f %% of %d vectors for test",
+        testProbability * 100, datapoints.size());
+
+    Collections.shuffle(datapoints);
+    return new Pair<List<? extends WeightedVector>, List<? extends WeightedVector>>(
+        datapoints.subList(numTest, datapoints.size()), datapoints.subList(0, numTest));
+  }
+
+  /**
+   * Clusters the datapoints in the list doing either random seeding of the centroids or k-means++.
+   *
+   * @param datapoints the points to be clustered.
+   * @return an UpdatableSearcher with the resulting clusters.
+   */
+  public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
+    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> trainTestSplit = splitTrainTest(datapoints);
+    List<Vector> bestCentroids = new ArrayList<>();
+    double cost = Double.POSITIVE_INFINITY;
+    double bestCost = Double.POSITIVE_INFINITY;
+    for (int i = 0; i < numRuns; ++i) {
+      centroids.clear();
+      if (kMeansPlusPlusInit) {
+        // Use k-means++ to set initial centroids.
+        initializeSeedsKMeansPlusPlus(trainTestSplit.getFirst());
+      } else {
+        // Randomly select the initial centroids.
+        initializeSeedsRandomly(trainTestSplit.getFirst());
+      }
+      // Do k-means iterations with trimmed mean computation (aka ball k-means).
+      if (numRuns > 1) {
+        // If the clustering is successful (there are no zero-weight centroids).
+        iterativeAssignment(trainTestSplit.getFirst());
+        // Compute the cost of the clustering and possibly save the centroids.
+        // NOTE(review): this ternary looks inverted relative to the testProbability javadoc --
+        // when a test set exists (splitTrainTest == true) one would expect the cost to be
+        // computed on trainTestSplit.getSecond(), and on all datapoints otherwise. As written,
+        // with no test set, getSecond() is empty. Confirm intent before changing.
+        cost = ClusteringUtils.totalClusterCost(
+            splitTrainTest ? datapoints : trainTestSplit.getSecond(), centroids);
+        if (cost < bestCost) {
+          bestCost = cost;
+          bestCentroids.clear();
+          Iterables.addAll(bestCentroids, centroids);
+        }
+      } else {
+        // If there is only going to be one run, the cost doesn't need to be computed, so we just return the clustering.
+        iterativeAssignment(datapoints);
+        return centroids;
+      }
+    }
+    if (bestCost == Double.POSITIVE_INFINITY) {
+      throw new RuntimeException("No valid clustering was found");
+    }
+    // Restore the best-scoring centroids if the last run was not the best one.
+    if (cost != bestCost) {
+      centroids.clear();
+      centroids.addAll(bestCentroids);
+    }
+    if (correctWeights) {
+      // Fold the weights of the held-out test points into their closest centroids so the
+      // final centroid weights account for every point.
+      for (WeightedVector testDatapoint : trainTestSplit.getSecond()) {
+        WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
+        closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
+      }
+    }
+    return centroids;
+  }
+
+  /**
+   * Selects some of the original points randomly with probability proportional to their weights. This is much
+   * less sophisticated than the kmeans++ approach, however it is faster and, coupled with multiple runs,
+   * can still produce acceptable seeds.
+   *
+   * The side effect of this method is to fill the centroids structure itself.
+   *
+   * @param datapoints The datapoints to select from. These datapoints should be WeightedVectors of some kind.
+   */
+  private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
+    int numDatapoints = datapoints.size();
+    double totalWeight = 0;
+    for (WeightedVector datapoint : datapoints) {
+      totalWeight += datapoint.getWeight();
+    }
+    // Sample seeds without replacement, weighted by each point's share of the total weight.
+    Multinomial<Integer> seedSelector = new Multinomial<>();
+    for (int i = 0; i < numDatapoints; ++i) {
+      seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
+    }
+    for (int i = 0; i < numClusters; ++i) {
+      int sample = seedSelector.sample();
+      seedSelector.delete(sample);
+      Centroid centroid = new Centroid(datapoints.get(sample));
+      centroid.setIndex(i);
+      centroids.add(centroid);
+    }
+  }
+
+  /**
+   * Selects some of the original points according to the k-means++ algorithm. The basic idea is that
+   * points are selected with probability proportional to their distance from any selected point. In
+   * this version, points have weights which multiply their likelihood of being selected. This is the
+   * same as if there were as many copies of the same point as indicated by the weight.
+   *
+   * This is pretty expensive, but it vastly improves the quality and convergences of the k-means algorithm.
+   * The basic idea can be made much faster by only processing a random subset of the original points.
+   * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
+   * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea. At
+   * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
+   * implementation.
+   *
+   * The side effect of this method is to fill the centroids structure itself.
+   *
+   * @param datapoints The datapoints to select from. These datapoints should be WeightedVectors of some kind.
+   */
+  private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
+    Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " +
+        "sensibly");
+    Preconditions.checkArgument(datapoints.size() >= numClusters,
+        String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters));
+    // Compute the centroid of all of the datapoints. This is then used to compute the squared radius of the datapoints.
+    Centroid center = new Centroid(datapoints.iterator().next());
+    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
+      center.update(row);
+    }
+
+    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
+    // this accelerates seed selection.
+    // NOTE(review): whether this sum is of *squared* distances depends on the DistanceMeasure
+    // supplied by the searcher (e.g. SquaredEuclideanDistanceMeasure) -- confirm against callers.
+    double deltaX = 0;
+    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
+    for (WeightedVector row : datapoints) {
+      deltaX += distanceMeasure.distance(row, center);
+    }
+
+    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
+    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2. This is done
+    // by first selecting c_1 with probability:
+    //
+    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
+    //
+    // This can be simplified to:
+    //
+    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
+    //
+    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
+    //
+    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
+    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.
+
+    // Multinomial distribution of vector indices for the selection seeds. These correspond to
+    // the indices of the vectors in the original datapoints list.
+    Multinomial<Integer> seedSelector = new Multinomial<>();
+    for (int i = 0; i < datapoints.size(); ++i) {
+      double selectionProbability =
+          deltaX + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
+      seedSelector.add(i, selectionProbability);
+    }
+
+    int selected = random.nextInt(datapoints.size());
+    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
+    c_1.setIndex(0);
+    // Construct a set of weighted things which can be used for random selection. Initial weights are
+    // set to the squared distance from c_1
+    for (int i = 0; i < datapoints.size(); ++i) {
+      WeightedVector row = datapoints.get(i);
+      double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
+      seedSelector.set(i, w);
+    }
+
+    // From here, seeds are selected with probability proportional to:
+    //
+    // r_i = min_{c_j} || x_i - c_j ||^2
+    //
+    // when we only have c_1, we have already set these distances and as we select each new
+    // seed, we update the minimum distances.
+    centroids.add(c_1);
+    int clusterIndex = 1;
+    while (centroids.size() < numClusters) {
+      // Select according to weights.
+      int seedIndex = seedSelector.sample();
+      Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
+      nextSeed.setIndex(clusterIndex++);
+      centroids.add(nextSeed);
+      // Don't select this one again.
+      seedSelector.delete(seedIndex);
+      // Re-weight everything according to the minimum distance to a seed.
+      for (int currSeedIndex : seedSelector) {
+        WeightedVector curr = datapoints.get(currSeedIndex);
+        double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
+        if (newWeight < seedSelector.getWeight(currSeedIndex)) {
+          seedSelector.set(currSeedIndex, newWeight);
+        }
+      }
+    }
+  }
+
+  /**
+   * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints points. To
+   * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction
+   * where d_i is
+   *
+   * d_i = min_j \sqrt ||c_j - c_i||^2
+   *
+   * By ignoring distant points, the centroids converge more quickly to a good approximation of the
+   * optimal k-means solution (given good starting points).
+   *
+   * @param datapoints the points to cluster.
+   */
+  private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
+    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
+    // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest
+    // neighboring cluster.
+    List<Double> closestClusterDistances = new ArrayList<>(numClusters);
+    // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When
+    // these don't change, we are done.
+    // Each point is assigned to the invalid "-1" cluster initially.
+    List<Integer> clusterAssignments = new ArrayList<>(Collections.nCopies(datapoints.size(), -1));
+
+    boolean changed = true;
+    for (int i = 0; changed && i < maxNumIterations; i++) {
+      changed = false;
+      // We compute what the distance between each cluster and its closest neighbor is to set a
+      // proportional distance threshold for points that should be involved in calculating the
+      // centroid.
+      closestClusterDistances.clear();
+      for (Vector center : centroids) {
+        // If a centroid has no points assigned to it, the clustering failed.
+        Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
+        closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
+      }
+
+      // Copies the current cluster centroids to newClusters and sets their weights to 0. This is
+      // so we calculate the new centroids as we go through the datapoints.
+      List<Centroid> newCentroids = new ArrayList<>();
+      for (Vector centroid : centroids) {
+        // need a deep copy because we will mutate these values
+        Centroid newCentroid = (Centroid)centroid.clone();
+        newCentroid.setWeight(0);
+        newCentroids.add(newCentroid);
+      }
+
+      // Pass over the datapoints computing new centroids.
+      for (int j = 0; j < datapoints.size(); ++j) {
+        WeightedVector datapoint = datapoints.get(j);
+        // Get the closest cluster this point belongs to.
+        WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
+        int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
+        double closestDistance = closestPair.getWeight();
+        // Update its cluster assignment if necessary.
+        if (closestIndex != clusterAssignments.get(j)) {
+          changed = true;
+          clusterAssignments.set(j, closestIndex);
+        }
+        // Only update if the datapoints point is near enough. What this means is that the weight
+        // of outliers is NOT taken into account and the final weights of the centroids will
+        // reflect this (it will be less or equal to the initial sum of the weights).
+        if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) {
+          newCentroids.get(closestIndex).update(datapoint);
+        }
+      }
+      // Add the new centers back into searcher.
+      centroids.clear();
+      centroids.addAll(newCentroids);
+    }
+
+    if (correctWeights) {
+      // Second pass: recompute each centroid's weight as the sum of the weights of ALL points
+      // assigned to it (including trimmed outliers); positions are left untouched.
+      for (Vector v : centroids) {
+        ((Centroid)v).setWeight(0);
+      }
+      for (WeightedVector datapoint : datapoints) {
+        Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
+        closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
+      }
+    }
+  }
+
+  /**
+   * Iterates over the current centroids. Every element in the searcher is expected to be a
+   * Centroid; anything else fails the precondition inside the transform.
+   */
+  @Override
+  public Iterator<Centroid> iterator() {
+    return Iterators.transform(centroids.iterator(), new Function<Vector, Centroid>() {
+      @Override
+      public Centroid apply(Vector input) {
+        Preconditions.checkArgument(input instanceof Centroid, "Non-centroid in centroids " +
+            "searcher");
+        //noinspection ConstantConditions
+        return (Centroid)input;
+      }
+    });
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
new file mode 100644
index 0000000..604bc9d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/cluster/StreamingKMeans.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.cluster;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixSlice;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.jet.math.Constants;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.WeightedThing;
+
+/**
+ * Implements a streaming k-means algorithm for weighted vectors.
+ * The goal is clustering points one at a time, especially useful for MapReduce mappers that get inputs one at a time.
+ *
+ * A rough description of the algorithm:
+ * Suppose there are l clusters at one point and a new point p is added.
+ * The new point can either be added to one of the existing l clusters or become a new cluster. To decide:
+ * - let c be the closest cluster to point p;
+ * - let d be the distance between c and p;
+ * - if d > distanceCutoff, create a new cluster from p (p is too far away from the clusters to be part of them;
+ * distanceCutoff represents the largest distance from a point to its assigned cluster's centroid);
+ * - else (d <= distanceCutoff), create a new cluster with probability d / distanceCutoff (the probability of creating
+ * a new cluster increases as d increases).
+ * There will be either l clusters or l + 1 clusters after processing a new point.
+ *
+ * As the number of clusters increases, it will go over the numClusters limit (numClusters represents a recommendation
+ * for the number of clusters that there should be at the end). To decrease the number of clusters the existing clusters
+ * are treated as data points and are re-clustered (collapsed). This tends to make the number of clusters go down.
+ * If the number of clusters is still too high, distanceCutoff is increased.
+ *
+ * For more details, see:
+ * - "Streaming k-means approximation" by N. Ailon, R. Jaiswal, C. Monteleoni
+ * http://books.nips.cc/papers/files/nips22/NIPS2009_1085.pdf
+ * - "Fast and Accurate k-means for Large Datasets" by M. Shindler, A. Wong, A. Meyerson,
+ * http://books.nips.cc/papers/files/nips24/NIPS2011_1271.pdf
+ */
+public class StreamingKMeans implements Iterable<Centroid> {
+  /**
+   * The searcher containing the centroids that resulted from the clustering of points until now. When adding a new
+   * point we either assign it to one of the existing clusters in this searcher or create a new centroid for it.
+   */
+  private final UpdatableSearcher centroids;
+
+  /**
+   * The estimated number of clusters to cluster the data in. If the actual number of clusters increases beyond this
+   * limit, the clusters will be "collapsed" (re-clustered, by treating them as data points). This doesn't happen
+   * recursively and a collapse might not necessarily make the number of actual clusters drop to less than this limit.
+   *
+   * If the goal is clustering a large data set into k clusters, numClusters SHOULD NOT BE SET to k. StreamingKMeans is
+   * useful to reduce the size of the data set by the mappers so that it can fit into memory in one reducer that runs
+   * BallKMeans.
+   *
+   * It is NOT MEANT to cluster the data into k clusters in one pass because it can't guarantee that there will in fact
+   * be k clusters in total. This is because of the dynamic nature of numClusters over the course of the runtime.
+   * To get an exact number of clusters, another clustering algorithm needs to be applied to the results.
+   */
+  private int numClusters;
+
+  /**
+   * The number of data points seen so far. This is important for re-estimating numClusters when deciding to collapse
+   * the existing clusters.
+   */
+  private int numProcessedDatapoints = 0;
+
+  /**
+   * This is the current value of the distance cutoff. Points which are much closer than this to a centroid will stick
+   * to it almost certainly. Points further than this to any centroid will form a new cluster.
+   *
+   * This increases (is multiplied by beta) when a cluster collapse did not make the number of clusters drop to below
+   * numClusters (it effectively increases the tolerance for cluster compactness discouraging the creation of new
+   * clusters). Since a collapse only happens when centroids.size() > clusterOvershoot * numClusters, the cutoff
+   * increases when the collapse didn't at least remove the slack in the number of clusters.
+   */
+  private double distanceCutoff;
+
+  /**
+   * Parameter that controls the growth of the distanceCutoff. After n increases of the
+   * distanceCutoff starting at d_0, the final value is d_0 * beta^n (distance cutoffs increase following a geometric
+   * progression with ratio beta).
+   */
+  private final double beta;
+
+  /**
+   * Multiplying clusterLogFactor with log(numProcessedDatapoints) gets an estimate of the suggested
+   * number of clusters. This mirrors the recommended number of clusters for n points where there should be k actual
+   * clusters, k * log n. In the case of our estimate we use clusterLogFactor * log(numProcessedDataPoints).
+   *
+   * It is important to note that numClusters is NOT k. It is an estimate of k * log n.
+   */
+  private final double clusterLogFactor;
+
+  /**
+   * Centroids are collapsed when the number of clusters becomes greater than clusterOvershoot * numClusters. This
+   * effectively means having a slack in numClusters so that the actual number of centroids, centroids.size() tracks
+   * numClusters approximately. The idea is that the actual number of clusters should be at least numClusters but not
+   * much more (so that we don't end up having 1 cluster / point).
+   */
+  private final double clusterOvershoot;
+
+  /**
+   * Random object to sample values from.
+   */
+  private final Random random = RandomUtils.getRandom();
+
+  /**
+   * Calls StreamingKMeans(searcher, numClusters, 1.0 / numClusters, 1.3, 20, 2).
+   * @see StreamingKMeans#StreamingKMeans(org.apache.mahout.math.neighborhood.UpdatableSearcher, int,
+   * double, double, double, double)
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters) {
+    this(searcher, numClusters, 1.0 / numClusters, 1.3, 20, 2);
+  }
+
+  /**
+   * Calls StreamingKMeans(searcher, numClusters, distanceCutoff, 1.3, 20, 2).
+   * @see StreamingKMeans#StreamingKMeans(org.apache.mahout.math.neighborhood.UpdatableSearcher, int,
+   * double, double, double, double)
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters, double distanceCutoff) {
+    this(searcher, numClusters, distanceCutoff, 1.3, 20, 2);
+  }
+
+  /**
+   * Creates a new StreamingKMeans class given a searcher and the number of clusters to generate.
+   *
+   * @param searcher A Searcher that is used for performing nearest neighbor search. It MUST BE
+   *                 EMPTY initially because it will be used to keep track of the cluster
+   *                 centroids.
+   * @param numClusters An estimated number of clusters to generate for the data points.
+   *                    This can be adjusted, but the actual number will depend on the data.
+   * @param distanceCutoff The initial distance cutoff representing the value of the
+   *                       distance between a point and its closest centroid after which
+   *                       the new point will definitely be assigned to a new cluster.
+   * @param beta Ratio of geometric progression to use when increasing distanceCutoff. After n increases, distanceCutoff
+   *             becomes distanceCutoff * beta^n. A smaller value increases the distanceCutoff less aggressively.
+   * @param clusterLogFactor Value multiplied with the number of points counted so far estimating the number of clusters
+   *                         to aim for. If the final number of clusters is known and this clustering is only for a
+   *                         sketch of the data, this can be the final number of clusters, k.
+   * @param clusterOvershoot Multiplicative slack factor for slowing down the collapse of the clusters.
+   */
+  public StreamingKMeans(UpdatableSearcher searcher, int numClusters,
+                         double distanceCutoff, double beta, double clusterLogFactor, double clusterOvershoot) {
+    this.centroids = searcher;
+    this.numClusters = numClusters;
+    this.distanceCutoff = distanceCutoff;
+    this.beta = beta;
+    this.clusterLogFactor = clusterLogFactor;
+    this.clusterOvershoot = clusterOvershoot;
+  }
+
+  /**
+   * @return an Iterator to the Centroids contained in this clusterer.
+   */
+  @Override
+  public Iterator<Centroid> iterator() {
+    return Iterators.transform(centroids.iterator(), new Function<Vector, Centroid>() {
+      @Override
+      public Centroid apply(Vector input) {
+        return (Centroid)input;
+      }
+    });
+  }
+
+  /**
+   * Cluster the rows of a matrix, treating them as Centroids with weight 1.
+   * @param data matrix whose rows are to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(Matrix data) {
+    return cluster(Iterables.transform(data, new Function<MatrixSlice, Centroid>() {
+      @Override
+      public Centroid apply(MatrixSlice input) {
+        // The key in a Centroid is actually the MatrixSlice's index.
+        return Centroid.create(input.index(), input.vector());
+      }
+    }));
+  }
+
+  /**
+   * Cluster the data points in an Iterable<Centroid>.
+   * @param datapoints Iterable whose elements are to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(Iterable<Centroid> datapoints) {
+    return clusterInternal(datapoints, false);
+  }
+
+  /**
+   * Cluster one data point.
+   * @param datapoint to be clustered.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  public UpdatableSearcher cluster(final Centroid datapoint) {
+    return cluster(new Iterable<Centroid>() {
+      @Override
+      public Iterator<Centroid> iterator() {
+        return new Iterator<Centroid>() {
+          // Single-use iterator: yields datapoint exactly once, then is exhausted.
+          private boolean accessed = false;
+
+          @Override
+          public boolean hasNext() {
+            return !accessed;
+          }
+
+          @Override
+          public Centroid next() {
+            accessed = true;
+            return datapoint;
+          }
+
+          @Override
+          public void remove() {
+            throw new UnsupportedOperationException();
+          }
+        };
+      }
+    });
+  }
+
+  /**
+   * @return the number of clusters computed from the points until now.
+   */
+  public int getNumClusters() {
+    return centroids.size();
+  }
+
+  /**
+   * Internal clustering method that gets called from the other wrappers.
+   * @param datapoints Iterable of data points to be clustered.
+   * @param collapseClusters whether this is an "inner" clustering and the datapoints are the previously computed
+   *                         centroids. Some logic is different to ensure counters are consistent but it behaves
+   *                         nearly the same.
+   * @return the UpdatableSearcher containing the resulting centroids.
+   */
+  private UpdatableSearcher clusterInternal(Iterable<Centroid> datapoints, boolean collapseClusters) {
+    Iterator<Centroid> datapointsIterator = datapoints.iterator();
+    if (!datapointsIterator.hasNext()) {
+      return centroids;
+    }
+
+    int oldNumProcessedDataPoints = numProcessedDatapoints;
+    // We clear the centroids we have in case of cluster collapse, the old clusters are the
+    // datapoints but we need to re-cluster them.
+    if (collapseClusters) {
+      centroids.clear();
+      numProcessedDatapoints = 0;
+    }
+
+    if (centroids.size() == 0) {
+      // Assign the first datapoint to the first cluster.
+      // Adding a vector to a searcher would normally just reference the copy,
+      // but we could potentially mutate it and so we need to make a clone.
+      centroids.add(datapointsIterator.next().clone());
+      ++numProcessedDatapoints;
+    }
+
+    // To cluster, we scan the data and either add each point to the nearest group or create a new group.
+    // when we get too many groups, we need to increase the threshold and rescan our current groups
+    while (datapointsIterator.hasNext()) {
+      Centroid row = datapointsIterator.next();
+      // Get the closest vector and its weight as a WeightedThing<Vector>.
+      // The weight of the WeightedThing is the distance to the query and the value is a
+      // reference to one of the vectors we added to the searcher previously.
+      WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
+
+      // We get a uniformly distributed random number between 0 and 1 and compare it with the
+      // distance to the closest cluster divided by the distanceCutoff.
+      // This is so that if the closest cluster is further than distanceCutoff,
+      // closestPair.getWeight() / distanceCutoff > 1 which will trigger the creation of a new
+      // cluster anyway.
+      // However, if the ratio is less than 1, we want to create a new cluster with probability
+      // proportional to the distance to the closest cluster.
+      // NOTE(review): the actual acceptance test below also scales by row.getWeight(), so heavier
+      // points are proportionally more likely to start a new cluster.
+      double sample = random.nextDouble();
+      if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
+        // Add new centroid, note that the vector is copied because we may mutate it later.
+        centroids.add(row.clone());
+      } else {
+        // Merge the new point with the existing centroid. This will update the centroid's actual
+        // position.
+        // We know that all the points we inserted in the centroids searcher are (or extend)
+        // WeightedVector, so the cast will always succeed.
+        Centroid centroid = (Centroid) closestPair.getValue();
+
+        // We will update the centroid by removing it from the searcher and reinserting it to
+        // ensure consistency.
+        if (!centroids.remove(centroid, Constants.EPSILON)) {
+          throw new RuntimeException("Unable to remove centroid");
+        }
+        centroid.update(row);
+        centroids.add(centroid);
+
+      }
+      ++numProcessedDatapoints;
+
+      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
+        // Too many clusters: re-estimate the target as max(current, clusterLogFactor * log n)
+        // and collapse the existing centroids by re-clustering them.
+        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
+
+        List<Centroid> shuffled = new ArrayList<>();
+        for (Vector vector : centroids) {
+          shuffled.add((Centroid) vector);
+        }
+        Collections.shuffle(shuffled);
+        // Re-cluster using the shuffled centroids as data points. The centroids member variable
+        // is modified directly.
+        clusterInternal(shuffled, true);
+
+        if (centroids.size() > numClusters) {
+          // The collapse did not get us under the limit; relax the cutoff geometrically.
+          distanceCutoff *= beta;
+        }
+      }
+    }
+
+    if (collapseClusters) {
+      // Restore the outer count so the recursive collapse does not double-count its points.
+      numProcessedDatapoints = oldNumProcessedDataPoints;
+    }
+    return centroids;
+  }
+
+  /**
+   * Reassigns consecutive indexes (0, 1, 2, ...) to the centroids in iteration order.
+   */
+  public void reindexCentroids() {
+    int numCentroids = 0;
+    for (Centroid centroid : this) {
+      centroid.setIndex(numCentroids++);
+    }
+  }
+
+  /**
+   * @return the distanceCutoff (an upper bound for the maximum distance within a cluster).
+   */
+  public double getDistanceCutoff() {
+    return distanceCutoff;
+  }
+
+  /**
+   * Sets the current distance cutoff.
+   * @param distanceCutoff the new cutoff value.
+   */
+  public void setDistanceCutoff(double distanceCutoff) {
+    this.distanceCutoff = distanceCutoff;
+  }
+
+  /**
+   * @return the distance measure used by the underlying centroid searcher.
+   */
+  public DistanceMeasure getDistanceMeasure() {
+    return centroids.getDistanceMeasure();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
new file mode 100644
index 0000000..a41940b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/streaming/mapreduce/CentroidWritable.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.mapreduce;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Hadoop {@link Writable} wrapper around a {@link Centroid}, serialized as
+ * (index, weight, vector).
+ */
+public class CentroidWritable implements Writable {
+  // Wrapped centroid; null until set by the constructor or the first readFields() call.
+  private Centroid centroid = null;
+
+  /** No-arg constructor required by the Writable contract; centroid stays null until readFields(). */
+  public CentroidWritable() {}
+
+  public CentroidWritable(Centroid centroid) {
+    this.centroid = centroid;
+  }
+
+  /**
+   * @return the wrapped centroid, or null if this writable has not been populated yet.
+   */
+  public Centroid getCentroid() {
+    return centroid;
+  }
+
+  /**
+   * Serializes the centroid as index, weight, then vector. The centroid must have
+   * been set (non-null) before calling this.
+   */
+  @Override
+  public void write(DataOutput dataOutput) throws IOException {
+    dataOutput.writeInt(centroid.getIndex());
+    dataOutput.writeDouble(centroid.getWeight());
+    VectorWritable.writeVector(dataOutput, centroid.getVector());
+  }
+
+  @Override
+  public void readFields(DataInput dataInput) throws IOException {
+    if (centroid == null) {
+      // First read: nothing to reuse, allocate a fresh Centroid.
+      centroid = read(dataInput);
+      return;
+    }
+    // Subsequent reads: reuse the existing centroid object to avoid reallocation.
+    centroid.setIndex(dataInput.readInt());
+    centroid.setWeight(dataInput.readDouble());
+    centroid.assign(VectorWritable.readVector(dataInput));
+  }
+
+  /**
+   * Reads a Centroid using the same (index, weight, vector) layout written by
+   * {@link #write(DataOutput)}.
+   */
+  public static Centroid read(DataInput dataInput) throws IOException {
+    int index = dataInput.readInt();
+    double weight = dataInput.readDouble();
+    Vector v = VectorWritable.readVector(dataInput);
+    return new Centroid(index, v, weight);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof CentroidWritable)) {
+      return false;
+    }
+    CentroidWritable writable = (CentroidWritable) o;
+    // Null-safe: a freshly constructed (unpopulated) writable must not throw here.
+    return centroid == null ? writable.centroid == null : centroid.equals(writable.centroid);
+  }
+
+  @Override
+  public int hashCode() {
+    // Null-safe so equals()/hashCode() stay usable before readFields() is called.
+    return centroid == null ? 0 : centroid.hashCode();
+  }
+
+  @Override
+  public String toString() {
+    // String.valueOf yields "null" instead of an NPE for an unpopulated writable.
+    return String.valueOf(centroid);
+  }
+}
r***@apache.org
2018-06-28 14:54:36 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
new file mode 100644
index 0000000..265d3da
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyMapper.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+
+@Deprecated
+class CanopyMapper extends
+    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
+
+  // Canopies accumulated over every point seen by this mapper; emitted in cleanup().
+  private final Collection<Canopy> canopies = Lists.newArrayList();
+
+  // Clusterer built from the job configuration in setup().
+  private CanopyClusterer canopyClusterer;
+
+  // Minimum observation count a canopy needs before its center is emitted.
+  private int clusterFilter;
+
+  /**
+   * Adds each input point to the in-memory canopy collection; nothing is written
+   * until cleanup(), after all points have been observed.
+   */
+  @Override
+  protected void map(WritableComparable<?> key, VectorWritable point,
+      Context context) throws IOException, InterruptedException {
+    canopyClusterer.addPointToCanopies(point.get(), canopies);
+  }
+
+  /**
+   * Configures the clusterer and reads the cluster filter threshold
+   * (CanopyConfigKeys.CF_KEY) from the job configuration.
+   */
+  @Override
+  protected void setup(Context context) throws IOException,
+      InterruptedException {
+    super.setup(context);
+    canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
+    clusterFilter = Integer.parseInt(context.getConfiguration().get(
+        CanopyConfigKeys.CF_KEY));
+  }
+
+  /**
+   * Finalizes every canopy and emits the center of each one that has more than
+   * clusterFilter observations, all under the single key "centroid" so one
+   * reducer sees every candidate center.
+   */
+  @Override
+  protected void cleanup(Context context) throws IOException,
+      InterruptedException {
+    for (Canopy canopy : canopies) {
+      canopy.computeParameters();
+      if (canopy.getNumObservations() > clusterFilter) {
+        context.write(new Text("centroid"), new VectorWritable(canopy
+            .getCenter()));
+      }
+    }
+    super.cleanup(context);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
new file mode 100644
index 0000000..cdd7d5e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyReducer.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import com.google.common.collect.Lists;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+@Deprecated
+public class CanopyReducer extends Reducer<Text, VectorWritable, Text, ClusterWritable> {
+
+  // Canopies accumulated over all values handled by this reducer.
+  private final Collection<Canopy> canopies = Lists.newArrayList();
+
+  // Clusterer configured in setup(); switched to T3/T4 thresholds on the reduce side.
+  private CanopyClusterer canopyClusterer;
+
+  // Minimum observation count a canopy needs before it is emitted.
+  private int clusterFilter;
+
+  // Package-private accessor (presumably for tests — confirm against test sources).
+  CanopyClusterer getCanopyClusterer() {
+    return canopyClusterer;
+  }
+
+  /**
+   * Re-clusters the mapper-produced candidate centers into final canopies, then
+   * emits every canopy with more than clusterFilter observations, keyed by its
+   * identifier.
+   */
+  @Override
+  protected void reduce(Text arg0, Iterable<VectorWritable> values,
+      Context context) throws IOException, InterruptedException {
+    for (VectorWritable value : values) {
+      Vector point = value.get();
+      canopyClusterer.addPointToCanopies(point, canopies);
+    }
+    for (Canopy canopy : canopies) {
+      canopy.computeParameters();
+      if (canopy.getNumObservations() > clusterFilter) {
+        ClusterWritable clusterWritable = new ClusterWritable();
+        clusterWritable.setValue(canopy);
+        context.write(new Text(canopy.getIdentifier()), clusterWritable);
+      }
+    }
+  }
+
+  /**
+   * Configures the clusterer from the job configuration, switches it to the
+   * T3/T4 thresholds used on the reduce side, and reads the cluster filter
+   * threshold (CanopyConfigKeys.CF_KEY).
+   */
+  @Override
+  protected void setup(Context context) throws IOException,
+      InterruptedException {
+    super.setup(context);
+    canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
+    canopyClusterer.useT3T4();
+    clusterFilter = Integer.parseInt(context.getConfiguration().get(
+        CanopyConfigKeys.CF_KEY));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
new file mode 100644
index 0000000..6b88388
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationConfigKeys.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+/**
+ * Constants used in Cluster Classification.
+ *
+ * Pure configuration-key holder; not instantiable.
+ */
+public final class ClusterClassificationConfigKeys {
+
+  // Key under which the input clusters location is stored in the job configuration.
+  public static final String CLUSTERS_IN = "clusters_in";
+
+  // Key for the outlier-removal threshold (presumably a pdf value below which points
+  // are treated as outliers — confirm against ClusterClassificationDriver).
+  public static final String OUTLIER_REMOVAL_THRESHOLD = "pdf_threshold";
+
+  // Key for the flag controlling whether only the most likely cluster is emitted.
+  public static final String EMIT_MOST_LIKELY = "emit_most_likely";
+
+  // Private constructor: utility class, never instantiated.
+  private ClusterClassificationConfigKeys() {
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
new file mode 100644
index 0000000..ead95cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationDriver.java
@@ -0,0 +1,313 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Classifies the vectors into different clusters found by the clustering
+ * algorithm.
+ */
+public final class ClusterClassificationDriver extends AbstractJob {
+
+ /**
+ * CLI to run Cluster Classification Driver.
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.clustersInOption()
+ .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy.")
+ .create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+
+ if (getConf() == null) {
+ setConf(new Configuration());
+ }
+ Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+ DefaultOptionCreator.SEQUENTIAL_METHOD);
+
+ double clusterClassificationThreshold = 0.0;
+ if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+ clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+ }
+
+ run(getConf(), input, clustersIn, output, clusterClassificationThreshold, true, runSequential);
+
+ return 0;
+ }
+
+ /**
+ * Constructor to be used by the ToolRunner.
+ */
+ private ClusterClassificationDriver() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new ClusterClassificationDriver(), args);
+ }
+
+ /**
+ * Uses {@link ClusterClassifier} to classify input vectors into their
+ * respective clusters.
+ *
+ * @param input
+ * the input vectors
+ * @param clusteringOutputPath
+ * the output path of clustering ( it reads clusters-*-final file
+ * from here )
+ * @param output
+ * the location to store the classified vectors
+ * @param clusterClassificationThreshold
+ * the threshold value of probability distribution function from 0.0
+ * to 1.0. Any vector with pdf less that this threshold will not be
+ * classified for the cluster.
+ * @param runSequential
+ * Run the process sequentially or in a mapreduce way.
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws ClassNotFoundException
+ */
+ public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output, Double clusterClassificationThreshold,
+ boolean emitMostLikely, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+ if (runSequential) {
+ classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+ } else {
+ classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+ }
+
+ }
+
+ private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
+ Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
+ List<Cluster> clusterModels = populateClusterModels(clusters, conf);
+ ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
+ ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
+ selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
+
+ }
+
+ /**
+ * Populates a list with clusters present in clusters-*-final directory.
+ *
+ * @param clusterOutputPath
+ * The output path of the clustering.
+ * @param conf
+ * The Hadoop Configuration
+ * @return The list of clusters found by the clustering.
+ * @throws IOException
+ */
+ private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
+ List<Cluster> clusterModels = new ArrayList<>();
+ Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
+ Iterator<?> it = new SequenceFileDirValueIterator<>(finalClustersPath, PathType.LIST,
+ PathFilters.partFilter(), null, false, conf);
+ while (it.hasNext()) {
+ ClusterWritable next = (ClusterWritable) it.next();
+ Cluster cluster = next.getValue();
+ cluster.configure(conf);
+ clusterModels.add(cluster);
+ }
+ return clusterModels;
+ }
+
+ private static Path finalClustersPath(Configuration conf, Path clusterOutputPath) throws IOException {
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ return clusterFiles[0].getPath();
+ }
+
+ /**
+ * Classifies the vector into its respective cluster.
+ *
+ * @param input
+ * the path containing the input vector.
+ * @param clusterModels
+ * the clusters
+ * @param clusterClassifier
+ * used to classify the vectors into different clusters
+ * @param output
+ * the path to store classified data
+ * @param clusterClassificationThreshold
+ * the threshold value of probability distribution function from 0.0
+ * to 1.0. Any vector with pdf less that this threshold will not be
+ * classified for the cluster
+ * @param emitMostLikely
+ * emit the vectors with the max pdf values per cluster
+ * @throws IOException
+ */
+ private static void selectCluster(Path input, List<Cluster> clusterModels, ClusterClassifier clusterClassifier,
+ Path output, Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
+ Configuration conf = new Configuration();
+ SequenceFile.Writer writer = new SequenceFile.Writer(input.getFileSystem(conf), conf, new Path(output,
+ "part-m-" + 0), IntWritable.class, WeightedPropertyVectorWritable.class);
+ for (Pair<Writable, VectorWritable> vw : new SequenceFileDirIterable<Writable, VectorWritable>(input, PathType.LIST,
+ PathFilters.logsCRCFilter(), conf)) {
+ // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point
+ // belongs to which cluster - fix for MAHOUT-1410
+ Class<? extends Writable> keyClass = vw.getFirst().getClass();
+ Vector vector = vw.getSecond().get();
+ if (!keyClass.equals(NamedVector.class)) {
+ if (keyClass.equals(Text.class)) {
+ vector = new NamedVector(vector, vw.getFirst().toString());
+ } else if (keyClass.equals(IntWritable.class)) {
+ vector = new NamedVector(vector, Integer.toString(((IntWritable) vw.getFirst()).get()));
+ }
+ }
+ Vector pdfPerCluster = clusterClassifier.classify(vector);
+ if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) {
+ classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
+ }
+ }
+ writer.close();
+ }
+
+ private static void classifyAndWrite(List<Cluster> clusterModels, Double clusterClassificationThreshold,
+ boolean emitMostLikely, SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
+ Map<Text, Text> props = new HashMap<>();
+ if (emitMostLikely) {
+ int maxValueIndex = pdfPerCluster.maxValueIndex();
+ WeightedPropertyVectorWritable weightedPropertyVectorWritable =
+ new WeightedPropertyVectorWritable(pdfPerCluster.maxValue(), vw.get(), props);
+ write(clusterModels, writer, weightedPropertyVectorWritable, maxValueIndex);
+ } else {
+ writeAllAboveThreshold(clusterModels, clusterClassificationThreshold, writer, vw, pdfPerCluster);
+ }
+ }
+
+ private static void writeAllAboveThreshold(List<Cluster> clusterModels, Double clusterClassificationThreshold,
+ SequenceFile.Writer writer, VectorWritable vw, Vector pdfPerCluster) throws IOException {
+ Map<Text, Text> props = new HashMap<>();
+ for (Element pdf : pdfPerCluster.nonZeroes()) {
+ if (pdf.get() >= clusterClassificationThreshold) {
+ WeightedPropertyVectorWritable wvw = new WeightedPropertyVectorWritable(pdf.get(), vw.get(), props);
+ int clusterIndex = pdf.index();
+ write(clusterModels, writer, wvw, clusterIndex);
+ }
+ }
+ }
+
+ private static void write(List<Cluster> clusterModels, SequenceFile.Writer writer,
+ WeightedPropertyVectorWritable weightedPropertyVectorWritable,
+ int maxValueIndex) throws IOException {
+ Cluster cluster = clusterModels.get(maxValueIndex);
+
+ DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+ DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+ double distance = distanceMeasure.distance(cluster.getCenter(), weightedPropertyVectorWritable.getVector());
+
+ weightedPropertyVectorWritable.getProperties().put(new Text("distance"), new Text(Double.toString(distance)));
+ writer.append(new IntWritable(cluster.getId()), weightedPropertyVectorWritable);
+ }
+
+ /**
+ * Decides whether the vector should be classified or not based on the max pdf
+ * value of the clusters and threshold value.
+ *
+ * @return whether the vector should be classified or not.
+ */
+ private static boolean shouldClassify(Vector pdfPerCluster, Double clusterClassificationThreshold) {
+ return pdfPerCluster.maxValue() >= clusterClassificationThreshold;
+ }
+
+ private static void classifyClusterMR(Configuration conf, Path input, Path clustersIn, Path output,
+ Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException, InterruptedException,
+ ClassNotFoundException {
+
+ conf.setFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD,
+ clusterClassificationThreshold.floatValue());
+ conf.setBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, emitMostLikely);
+ conf.set(ClusterClassificationConfigKeys.CLUSTERS_IN, clustersIn.toUri().toString());
+
+ Job job = new Job(conf, "Cluster Classification Driver running over input: " + input);
+ job.setJarByClass(ClusterClassificationDriver.class);
+
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+ job.setMapperClass(ClusterClassificationMapper.class);
+ job.setNumReduceTasks(0);
+
+ job.setOutputKeyClass(IntWritable.class);
+ job.setOutputValueClass(WeightedPropertyVectorWritable.class);
+
+ FileInputFormat.addInputPath(job, input);
+ FileOutputFormat.setOutputPath(job, output);
+ if (!job.waitForCompletion(true)) {
+ throw new InterruptedException("Cluster Classification Driver Job failed processing " + input);
+ }
+ }
+
+ public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
+ double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential) throws IOException,
+ InterruptedException, ClassNotFoundException {
+ if (runSequential) {
+ classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+ } else {
+ classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
new file mode 100644
index 0000000..fffa7f9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassificationMapper.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * Mapper for classifying vectors into clusters.
+ */
+public class ClusterClassificationMapper extends
+ Mapper<WritableComparable<?>,VectorWritable,IntWritable,WeightedVectorWritable> {
+
+ private double threshold;
+ private List<Cluster> clusterModels;
+ private ClusterClassifier clusterClassifier;
+ private IntWritable clusterId;
+ private boolean emitMostLikely;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+
+ Configuration conf = context.getConfiguration();
+ String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN);
+ threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f);
+ emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false);
+
+ clusterModels = new ArrayList<>();
+
+ if (clustersIn != null && !clustersIn.isEmpty()) {
+ Path clustersInPath = new Path(clustersIn);
+ clusterModels = populateClusterModels(clustersInPath, conf);
+ ClusteringPolicy policy = ClusterClassifier
+ .readPolicy(finalClustersPath(clustersInPath));
+ clusterClassifier = new ClusterClassifier(clusterModels, policy);
+ }
+ clusterId = new IntWritable();
+ }
+
+ /**
+ * Mapper which classifies the vectors to respective clusters.
+ */
+ @Override
+ protected void map(WritableComparable<?> key, VectorWritable vw, Context context)
+ throws IOException, InterruptedException {
+ if (!clusterModels.isEmpty()) {
+ // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point
+ // belongs to which cluster - fix for MAHOUT-1410
+ Class<? extends Vector> vectorClass = vw.get().getClass();
+ Vector vector = vw.get();
+ if (!vectorClass.equals(NamedVector.class)) {
+ if (key.getClass().equals(Text.class)) {
+ vector = new NamedVector(vector, key.toString());
+ } else if (key.getClass().equals(IntWritable.class)) {
+ vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get()));
+ }
+ }
+ Vector pdfPerCluster = clusterClassifier.classify(vector);
+ if (shouldClassify(pdfPerCluster)) {
+ if (emitMostLikely) {
+ int maxValueIndex = pdfPerCluster.maxValueIndex();
+ write(new VectorWritable(vector), context, maxValueIndex, 1.0);
+ } else {
+ writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster);
+ }
+ }
+ }
+ }
+
+ private void writeAllAboveThreshold(VectorWritable vw, Context context,
+ Vector pdfPerCluster) throws IOException, InterruptedException {
+ for (Element pdf : pdfPerCluster.nonZeroes()) {
+ if (pdf.get() >= threshold) {
+ int clusterIndex = pdf.index();
+ write(vw, context, clusterIndex, pdf.get());
+ }
+ }
+ }
+
+ private void write(VectorWritable vw, Context context, int clusterIndex, double weight)
+ throws IOException, InterruptedException {
+ Cluster cluster = clusterModels.get(clusterIndex);
+ clusterId.set(cluster.getId());
+
+ DistanceMeasureCluster distanceMeasureCluster = (DistanceMeasureCluster) cluster;
+ DistanceMeasure distanceMeasure = distanceMeasureCluster.getMeasure();
+ double distance = distanceMeasure.distance(cluster.getCenter(), vw.get());
+
+ Map<Text, Text> props = new HashMap<>();
+ props.put(new Text("distance"), new Text(Double.toString(distance)));
+ context.write(clusterId, new WeightedPropertyVectorWritable(weight, vw.get(), props));
+ }
+
+ public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
+ List<Cluster> clusters = new ArrayList<>();
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ Iterator<?> it = new SequenceFileDirValueIterator<>(
+ clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(),
+ null, false, conf);
+ while (it.hasNext()) {
+ ClusterWritable next = (ClusterWritable) it.next();
+ Cluster cluster = next.getValue();
+ cluster.configure(conf);
+ clusters.add(cluster);
+ }
+ return clusters;
+ }
+
+ private boolean shouldClassify(Vector pdfPerCluster) {
+ return pdfPerCluster.maxValue() >= threshold;
+ }
+
+ private static Path finalClustersPath(Path clusterOutputPath) throws IOException {
+ FileSystem fileSystem = clusterOutputPath.getFileSystem(new Configuration());
+ FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
+ return clusterFiles[0].getPath();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
new file mode 100644
index 0000000..dcd4062
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/ClusterClassifier.java
@@ -0,0 +1,231 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.classify;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.ClusteringPolicyWritable;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * This classifier works with any ClusteringPolicy and its associated Clusters.
+ * It is initialized with a policy and a list of compatible clusters and
+ * thereafter it can classify any new Vector into one or more of the clusters
+ * based upon the pdf() function which each cluster supports.
+ * <p/>
+ * In addition, it is an OnlineLearner and can be trained. Training amounts to
+ * asking the actual model to observe the vector and closing the classifier
+ * causes all the models to computeParameters.
+ * <p/>
+ * Because a ClusterClassifier implements Writable, it can be written-to and
+ * read-from a sequence file as a single entity. For sequential and MapReduce
+ * clustering in conjunction with a ClusterIterator; however, it utilizes an
+ * exploded file format. In this format, the iterator writes the policy to a
+ * single POLICY_FILE_NAME file in the clustersOut directory and the models are
+ * written to one or more part-n files so that multiple reducers may employed to
+ * produce them.
+ */
+public class ClusterClassifier extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+ private static final String POLICY_FILE_NAME = "_policy";
+
+ private List<Cluster> models;
+
+ private String modelClass;
+
+ private ClusteringPolicy policy;
+
+ /**
+ * The public constructor accepts a list of clusters to become the models
+ *
+ * @param models a List<Cluster>
+ * @param policy a ClusteringPolicy
+ */
+ public ClusterClassifier(List<Cluster> models, ClusteringPolicy policy) {
+ this.models = models;
+ modelClass = models.get(0).getClass().getName();
+ this.policy = policy;
+ }
+
+ // needed for serialization/De-serialization
+ public ClusterClassifier() {
+ }
+
+ // only used by MR ClusterIterator
+ protected ClusterClassifier(ClusteringPolicy policy) {
+ this.policy = policy;
+ }
+
+ @Override
+ public Vector classify(Vector instance) {
+ return policy.classify(instance, this);
+ }
+
+ @Override
+ public double classifyScalar(Vector instance) {
+ if (models.size() == 2) {
+ double pdf0 = models.get(0).pdf(new VectorWritable(instance));
+ double pdf1 = models.get(1).pdf(new VectorWritable(instance));
+ return pdf0 / (pdf0 + pdf1);
+ }
+ throw new IllegalStateException();
+ }
+
+ @Override
+ public int numCategories() {
+ return models.size();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(models.size());
+ out.writeUTF(modelClass);
+ new ClusteringPolicyWritable(policy).write(out);
+ for (Cluster cluster : models) {
+ cluster.write(out);
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ int size = in.readInt();
+ modelClass = in.readUTF();
+ models = new ArrayList<>();
+ ClusteringPolicyWritable clusteringPolicyWritable = new ClusteringPolicyWritable();
+ clusteringPolicyWritable.readFields(in);
+ policy = clusteringPolicyWritable.getValue();
+ for (int i = 0; i < size; i++) {
+ Cluster element = ClassUtils.instantiateAs(modelClass, Cluster.class);
+ element.readFields(in);
+ models.add(element);
+ }
+ }
+
+ @Override
+ public void train(int actual, Vector instance) {
+ models.get(actual).observe(new VectorWritable(instance));
+ }
+
+ /**
+ * Train the models given an additional weight. Unique to ClusterClassifier
+ *
+ * @param actual the int index of a model
+ * @param data a data Vector
+ * @param weight a double weighting factor
+ */
+ public void train(int actual, Vector data, double weight) {
+ models.get(actual).observe(new VectorWritable(data), weight);
+ }
+
+ @Override
+ public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+ models.get(actual).observe(new VectorWritable(instance));
+ }
+
+ @Override
+ public void train(long trackingKey, int actual, Vector instance) {
+ models.get(actual).observe(new VectorWritable(instance));
+ }
+
+ @Override
+ public void close() {
+ policy.close(this);
+ }
+
+ public List<Cluster> getModels() {
+ return models;
+ }
+
+ public ClusteringPolicy getPolicy() {
+ return policy;
+ }
+
+ public void writeToSeqFiles(Path path) throws IOException {
+ writePolicy(policy, path);
+ Configuration config = new Configuration();
+ FileSystem fs = FileSystem.get(path.toUri(), config);
+ ClusterWritable cw = new ClusterWritable();
+ for (int i = 0; i < models.size(); i++) {
+ try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, config,
+ new Path(path, "part-" + String.format(Locale.ENGLISH, "%05d", i)), IntWritable.class,
+ ClusterWritable.class)) {
+ Cluster cluster = models.get(i);
+ cw.setValue(cluster);
+ Writable key = new IntWritable(i);
+ writer.append(key, cw);
+ }
+ }
+ }
+
+ public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
+ Configuration config = new Configuration();
+ List<Cluster> clusters = new ArrayList<>();
+ for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST,
+ PathFilters.logsCRCFilter(), config)) {
+ Cluster cluster = cw.getValue();
+ cluster.configure(conf);
+ clusters.add(cluster);
+ }
+ this.models = clusters;
+ modelClass = models.get(0).getClass().getName();
+ this.policy = readPolicy(path);
+ }
+
+ public static ClusteringPolicy readPolicy(Path path) throws IOException {
+ Path policyPath = new Path(path, POLICY_FILE_NAME);
+ Configuration config = new Configuration();
+ FileSystem fs = FileSystem.get(policyPath.toUri(), config);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, policyPath, config);
+ Text key = new Text();
+ ClusteringPolicyWritable cpw = new ClusteringPolicyWritable();
+ reader.next(key, cpw);
+ Closeables.close(reader, true);
+ return cpw.getValue();
+ }
+
+ public static void writePolicy(ClusteringPolicy policy, Path path) throws IOException {
+ Path policyPath = new Path(path, POLICY_FILE_NAME);
+ Configuration config = new Configuration();
+ FileSystem fs = FileSystem.get(policyPath.toUri(), config);
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, policyPath, Text.class,
+ ClusteringPolicyWritable.class);
+ writer.append(new Text(), new ClusteringPolicyWritable(policy));
+ Closeables.close(writer, false);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
new file mode 100644
index 0000000..567659b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedPropertyVectorWritable.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.math.Vector;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class WeightedPropertyVectorWritable extends WeightedVectorWritable {
+
+ private Map<Text, Text> properties;
+
+ public WeightedPropertyVectorWritable() {
+ }
+
+ public WeightedPropertyVectorWritable(Map<Text, Text> properties) {
+ this.properties = properties;
+ }
+
+ public WeightedPropertyVectorWritable(double weight, Vector vector, Map<Text, Text> properties) {
+ super(weight, vector);
+ this.properties = properties;
+ }
+
+ public Map<Text, Text> getProperties() {
+ return properties;
+ }
+
+ public void setProperties(Map<Text, Text> properties) {
+ this.properties = properties;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ super.readFields(in);
+ int size = in.readInt();
+ if (size > 0) {
+ properties = new HashMap<>();
+ for (int i = 0; i < size; i++) {
+ Text key = new Text(in.readUTF());
+ Text val = new Text(in.readUTF());
+ properties.put(key, val);
+ }
+ }
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ super.write(out);
+ out.writeInt(properties != null ? properties.size() : 0);
+ if (properties != null) {
+ for (Map.Entry<Text, Text> entry : properties.entrySet()) {
+ out.writeUTF(entry.getKey().toString());
+ out.writeUTF(entry.getValue().toString());
+ }
+ }
+ }
+
+ @Override
+ public String toString() {
+ Vector vector = getVector();
+ StringBuilder bldr = new StringBuilder("wt: ").append(getWeight()).append(' ');
+ if (properties != null && !properties.isEmpty()) {
+ for (Map.Entry<Text, Text> entry : properties.entrySet()) {
+ bldr.append(entry.getKey().toString()).append(": ").append(entry.getValue().toString()).append(' ');
+ }
+ }
+ bldr.append(" vec: ").append(vector == null ? "null" : AbstractCluster.formatVector(vector, null));
+ return bldr.toString();
+ }
+
+
+}
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
new file mode 100644
index 0000000..510dd39
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/classify/WeightedVectorWritable.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.classify;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class WeightedVectorWritable implements Writable {
+
+ private final VectorWritable vectorWritable = new VectorWritable();
+ private double weight;
+
+ public WeightedVectorWritable() {
+ }
+
+ public WeightedVectorWritable(double weight, Vector vector) {
+ this.vectorWritable.set(vector);
+ this.weight = weight;
+ }
+
+ public Vector getVector() {
+ return vectorWritable.get();
+ }
+
+ public void setVector(Vector vector) {
+ vectorWritable.set(vector);
+ }
+
+ public double getWeight() {
+ return weight;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ vectorWritable.readFields(in);
+ weight = in.readDouble();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ vectorWritable.write(out);
+ out.writeDouble(weight);
+ }
+
+ @Override
+ public String toString() {
+ Vector vector = vectorWritable.get();
+ return weight + ": " + (vector == null ? "null" : AbstractCluster.formatVector(vector, null));
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
new file mode 100644
index 0000000..ff02a4c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansClusterer.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+
+public class FuzzyKMeansClusterer {
+
+ private static final double MINIMAL_VALUE = 0.0000000001;
+
+ private double m = 2.0; // default value
+
+ public Vector computePi(Collection<SoftCluster> clusters, List<Double> clusterDistanceList) {
+ Vector pi = new DenseVector(clusters.size());
+ for (int i = 0; i < clusters.size(); i++) {
+ double probWeight = computeProbWeight(clusterDistanceList.get(i), clusterDistanceList);
+ pi.set(i, probWeight);
+ }
+ return pi;
+ }
+
+ /** Computes the probability of a point belonging to a cluster */
+ public double computeProbWeight(double clusterDistance, Iterable<Double> clusterDistanceList) {
+ if (clusterDistance == 0) {
+ clusterDistance = MINIMAL_VALUE;
+ }
+ double denom = 0.0;
+ for (double eachCDist : clusterDistanceList) {
+ if (eachCDist == 0.0) {
+ eachCDist = MINIMAL_VALUE;
+ }
+ denom += Math.pow(clusterDistance / eachCDist, 2.0 / (m - 1));
+ }
+ return 1.0 / denom;
+ }
+
+ public void setM(double m) {
+ this.m = m;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
new file mode 100644
index 0000000..98eb944
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
@@ -0,0 +1,324 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class FuzzyKMeansDriver extends AbstractJob {
+
+ public static final String M_OPTION = "m";
+
+ private static final Logger log = LoggerFactory.getLogger(FuzzyKMeansDriver.class);
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new FuzzyKMeansDriver(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.clustersInOption()
+ .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. "
+ + "If k is also specified, then a random set of vectors will be selected"
+ + " and written out to this path first")
+ .create());
+ addOption(DefaultOptionCreator.numClustersOption()
+ .withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen"
+ + " as the Centroid and written to the clusters input path.").create());
+ addOption(DefaultOptionCreator.convergenceOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
+ addOption(DefaultOptionCreator.clusteringOption().create());
+ addOption(DefaultOptionCreator.emitMostLikelyOption().create());
+ addOption(DefaultOptionCreator.thresholdOption().create());
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.useSetRandomSeedOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+ Path output = getOutputPath();
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ if (measureClass == null) {
+ measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ }
+ double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+ float fuzziness = Float.parseFloat(getOption(M_OPTION));
+
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ boolean emitMostLikely = Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
+ double threshold = Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+
+ if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+ int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+ Long seed = null;
+ if (hasOption(DefaultOptionCreator.RANDOM_SEED)) {
+ seed = Long.parseLong(getOption(DefaultOptionCreator.RANDOM_SEED));
+ }
+
+ clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, numClusters, measure, seed);
+ }
+
+ boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+ DefaultOptionCreator.SEQUENTIAL_METHOD);
+
+ run(getConf(),
+ input,
+ clusters,
+ output,
+ convergenceDelta,
+ maxIterations,
+ fuzziness,
+ runClustering,
+ emitMostLikely,
+ threshold,
+ runSequential);
+ return 0;
+ }
+
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use the
+ * results of the final iteration to cluster the input vectors.
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for initial & computed clusters
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+* the convergence delta value
+ * @param maxIterations
+* the maximum number of iterations
+ * @param m
+* the fuzzification factor, see
+* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+ * @param runClustering
+* true if points are to be clustered after iterations complete
+ * @param emitMostLikely
+* a boolean if true emit only most likely cluster for each point
+ * @param threshold
+* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+ * @param runSequential if true run in sequential execution mode
+ */
+ public static void run(Path input,
+ Path clustersIn,
+ Path output,
+ double convergenceDelta,
+ int maxIterations,
+ float m,
+ boolean runClustering,
+ boolean emitMostLikely,
+ double threshold,
+ boolean runSequential) throws IOException, ClassNotFoundException, InterruptedException {
+ Configuration conf = new Configuration();
+ Path clustersOut = buildClusters(conf,
+ input,
+ clustersIn,
+ output,
+ convergenceDelta,
+ maxIterations,
+ m,
+ runSequential);
+ if (runClustering) {
+ log.info("Clustering ");
+ clusterData(conf, input,
+ clustersOut,
+ output,
+ convergenceDelta,
+ m,
+ emitMostLikely,
+ threshold,
+ runSequential);
+ }
+ }
+
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use the
+ * results of the final iteration to cluster the input vectors.
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for initial & computed clusters
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+* the convergence delta value
+ * @param maxIterations
+* the maximum number of iterations
+ * @param m
+* the fuzzification factor, see
+* http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+ * @param runClustering
+* true if points are to be clustered after iterations complete
+ * @param emitMostLikely
+* a boolean if true emit only most likely cluster for each point
+ * @param threshold
+* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+ * @param runSequential if true run in sequential execution mode
+ */
+ public static void run(Configuration conf,
+ Path input,
+ Path clustersIn,
+ Path output,
+ double convergenceDelta,
+ int maxIterations,
+ float m,
+ boolean runClustering,
+ boolean emitMostLikely,
+ double threshold,
+ boolean runSequential)
+ throws IOException, ClassNotFoundException, InterruptedException {
+ Path clustersOut =
+ buildClusters(conf, input, clustersIn, output, convergenceDelta, maxIterations, m, runSequential);
+ if (runClustering) {
+ log.info("Clustering");
+ clusterData(conf,
+ input,
+ clustersOut,
+ output,
+ convergenceDelta,
+ m,
+ emitMostLikely,
+ threshold,
+ runSequential);
+ }
+ }
+
+ /**
+ * Iterate over the input vectors to produce cluster directories for each iteration
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the file pathname for initial cluster centers
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+ * the convergence delta value
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param m
+ * the fuzzification factor, see
+ * http://en.wikipedia.org/wiki/Data_clustering#Fuzzy_c-means_clustering
+ * @param runSequential if true run in sequential execution mode
+ *
+ * @return the Path of the final clusters directory
+ */
+ public static Path buildClusters(Configuration conf,
+ Path input,
+ Path clustersIn,
+ Path output,
+ double convergenceDelta,
+ int maxIterations,
+ float m,
+ boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+
+ List<Cluster> clusters = new ArrayList<>();
+ FuzzyKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
+
+ if (conf == null) {
+ conf = new Configuration();
+ }
+
+ if (clusters.isEmpty()) {
+ throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument.");
+ }
+
+ Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+ ClusteringPolicy policy = new FuzzyKMeansClusteringPolicy(m, convergenceDelta);
+ ClusterClassifier prior = new ClusterClassifier(clusters, policy);
+ prior.writeToSeqFiles(priorClustersPath);
+
+ if (runSequential) {
+ ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+ } else {
+ ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
+ }
+ return output;
+ }
+
+ /**
+ * Run the job using supplied arguments
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for input clusters
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+* the convergence delta value
+ * @param emitMostLikely
+* a boolean if true emit only most likely cluster for each point
+ * @param threshold
+* a double threshold value emits all clusters having greater pdf (emitMostLikely = false)
+ * @param runSequential if true run in sequential execution mode
+ */
+ public static void clusterData(Configuration conf,
+ Path input,
+ Path clustersIn,
+ Path output,
+ double convergenceDelta,
+ float m,
+ boolean emitMostLikely,
+ double threshold,
+ boolean runSequential)
+ throws IOException, ClassNotFoundException, InterruptedException {
+
+ ClusterClassifier.writePolicy(new FuzzyKMeansClusteringPolicy(m, convergenceDelta), clustersIn);
+ ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+ threshold, emitMostLikely, runSequential);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
new file mode 100644
index 0000000..25621bb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansUtil.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+
+final class FuzzyKMeansUtil {
+
+ private FuzzyKMeansUtil() {}
+
+ /**
+ * Create a list of SoftClusters from whatever type is passed in as the prior
+ *
+ * @param conf
+ * the Configuration
+ * @param clusterPath
+ * the path to the prior Clusters
+ * @param clusters
+ * a List<Cluster> to put values into
+ */
+ public static void configureWithClusterInfo(Configuration conf, Path clusterPath, List<Cluster> clusters) {
+ for (Writable value : new SequenceFileDirValueIterable<>(clusterPath, PathType.LIST,
+ PathFilters.partFilter(), conf)) {
+ Class<? extends Writable> valueClass = value.getClass();
+
+ if (valueClass.equals(ClusterWritable.class)) {
+ ClusterWritable clusterWritable = (ClusterWritable) value;
+ value = clusterWritable.getValue();
+ valueClass = value.getClass();
+ }
+
+ if (valueClass.equals(Kluster.class)) {
+ // get the cluster info
+ Kluster cluster = (Kluster) value;
+ clusters.add(new SoftCluster(cluster.getCenter(), cluster.getId(), cluster.getMeasure()));
+ } else if (valueClass.equals(SoftCluster.class)) {
+ // get the cluster info
+ clusters.add((SoftCluster) value);
+ } else if (valueClass.equals(Canopy.class)) {
+ // get the cluster info
+ Canopy canopy = (Canopy) value;
+ clusters.add(new SoftCluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+ } else {
+ throw new IllegalStateException("Bad value class: " + valueClass);
+ }
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
new file mode 100644
index 0000000..52fd764
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/fuzzykmeans/SoftCluster.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.fuzzykmeans;
+
+import org.apache.mahout.clustering.kmeans.Kluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public class SoftCluster extends Kluster {
+
+ // For Writable
+ public SoftCluster() {}
+
+ /**
+ * Construct a new SoftCluster with the given point as its center
+ *
+ * @param center
+ * the center point
+ * @param measure
+ * the DistanceMeasure
+ */
+ public SoftCluster(Vector center, int clusterId, DistanceMeasure measure) {
+ super(center, clusterId, measure);
+ }
+
+ @Override
+ public String asFormatString() {
+ return this.getIdentifier() + ": "
+ + this.computeCentroid().asFormatString();
+ }
+
+ @Override
+ public String getIdentifier() {
+ return (isConverged() ? "SV-" : "SC-") + getId();
+ }
+
+ @Override
+ public double pdf(VectorWritable vw) {
+ // SoftCluster pdf cannot be calculated out of context. See
+ // FuzzyKMeansClusterer
+ throw new UnsupportedOperationException(
+ "SoftCluster pdf cannot be calculated out of context. See FuzzyKMeansClusterer");
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
new file mode 100644
index 0000000..07cc7e3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/AbstractClusteringPolicy.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.TimesFunction;
+
+public abstract class AbstractClusteringPolicy implements ClusteringPolicy {
+
+ @Override
+ public abstract void write(DataOutput out) throws IOException;
+
+ @Override
+ public abstract void readFields(DataInput in) throws IOException;
+
+ @Override
+ public Vector select(Vector probabilities) {
+ int maxValueIndex = probabilities.maxValueIndex();
+ Vector weights = new SequentialAccessSparseVector(probabilities.size());
+ weights.set(maxValueIndex, 1.0);
+ return weights;
+ }
+
+ @Override
+ public void update(ClusterClassifier posterior) {
+ // nothing to do in general here
+ }
+
+ @Override
+ public Vector classify(Vector data, ClusterClassifier prior) {
+ List<Cluster> models = prior.getModels();
+ int i = 0;
+ Vector pdfs = new DenseVector(models.size());
+ for (Cluster model : models) {
+ pdfs.set(i++, model.pdf(new VectorWritable(data)));
+ }
+ return pdfs.assign(new TimesFunction(), 1.0 / pdfs.zSum());
+ }
+
+ @Override
+ public void close(ClusterClassifier posterior) {
+ for (Cluster cluster : posterior.getModels()) {
+ cluster.computeParameters();
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
new file mode 100644
index 0000000..fb2db49
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIMapper.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+public class CIMapper extends Mapper<WritableComparable<?>,VectorWritable,IntWritable,ClusterWritable> {
+
+ private ClusterClassifier classifier;
+ private ClusteringPolicy policy;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ Configuration conf = context.getConfiguration();
+ String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY);
+ classifier = new ClusterClassifier();
+ classifier.readFromSeqFiles(conf, new Path(priorClustersPath));
+ policy = classifier.getPolicy();
+ policy.update(classifier);
+ super.setup(context);
+ }
+
+ @Override
+ protected void map(WritableComparable<?> key, VectorWritable value, Context context) throws IOException,
+ InterruptedException {
+ Vector probabilities = classifier.classify(value.get());
+ Vector selections = policy.select(probabilities);
+ for (Element el : selections.nonZeroes()) {
+ classifier.train(el.index(), value.get(), el.get());
+ }
+ }
+
+ @Override
+ protected void cleanup(Context context) throws IOException, InterruptedException {
+ List<Cluster> clusters = classifier.getModels();
+ ClusterWritable cw = new ClusterWritable();
+ for (int index = 0; index < clusters.size(); index++) {
+ cw.setValue(clusters.get(index));
+ context.write(new IntWritable(index), cw);
+ }
+ super.cleanup(context);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
new file mode 100644
index 0000000..ca63b0f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CIReducer.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+
+public class CIReducer extends Reducer<IntWritable,ClusterWritable,IntWritable,ClusterWritable> {
+
+ private ClusterClassifier classifier;
+ private ClusteringPolicy policy;
+
+ @Override
+ protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException,
+ InterruptedException {
+ Iterator<ClusterWritable> iter = values.iterator();
+ Cluster first = iter.next().getValue(); // there must always be at least one
+ while (iter.hasNext()) {
+ Cluster cluster = iter.next().getValue();
+ first.observe(cluster);
+ }
+ List<Cluster> models = new ArrayList<>();
+ models.add(first);
+ classifier = new ClusterClassifier(models, policy);
+ classifier.close();
+ context.write(key, new ClusterWritable(first));
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ Configuration conf = context.getConfiguration();
+ String priorClustersPath = conf.get(ClusterIterator.PRIOR_PATH_KEY);
+ classifier = new ClusterClassifier();
+ classifier.readFromSeqFiles(conf, new Path(priorClustersPath));
+ policy = classifier.getPolicy();
+ policy.update(classifier);
+ super.setup(context);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
new file mode 100644
index 0000000..c9a0940
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/CanopyClusteringPolicy.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Winner-take-all clustering policy: all weight is assigned to the single
+ * most probable model.
+ */
+@Deprecated
+public class CanopyClusteringPolicy extends AbstractClusteringPolicy {
+
+  // t1/t2 — serialized thresholds; presumably the canopy distances (unused here).
+  private double t1;
+  private double t2;
+
+  /** Return a sparse vector holding 1.0 at the index of the highest probability. */
+  @Override
+  public Vector select(Vector probabilities) {
+    Vector result = new SequentialAccessSparseVector(probabilities.size());
+    result.set(probabilities.maxValueIndex(), 1.0);
+    return result;
+  }
+
+  /** Serialize both thresholds, t1 first. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(t1);
+    out.writeDouble(t2);
+  }
+
+  /** Deserialize both thresholds in write order. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    t1 = in.readDouble();
+    t2 = in.readDouble();
+  }
+
+}
r***@apache.org
2018-06-28 14:54:35 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
new file mode 100644
index 0000000..516177f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterIterator.java
@@ -0,0 +1,219 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import com.google.common.io.Closeables;
+
+/**
+ * This is a clustering iterator which works with a set of Vector data and a prior ClusterClassifier which has been
+ * initialized with a set of models. Its implementation is algorithm-neutral and works for any iterative clustering
+ * algorithm (currently k-means and fuzzy-k-means) that processes all the input vectors in each iteration.
+ * The cluster classifier is configured with a ClusteringPolicy to select the desired clustering algorithm.
+ */
+public final class ClusterIterator {
+
+  /** Configuration key under which the prior classifier's path is published to the MR job. */
+  public static final String PRIOR_PATH_KEY = "org.apache.mahout.clustering.prior.path";
+
+  private ClusterIterator() {
+  }
+
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations
+   *
+   * @param data
+   *          a {@code List<Vector>} of input vectors
+   * @param classifier
+   *          a prior ClusterClassifier
+   * @param numIterations
+   *          the int number of iterations to perform
+   *
+   * @return the posterior ClusterClassifier
+   */
+  public static ClusterClassifier iterate(Iterable<Vector> data, ClusterClassifier classifier, int numIterations) {
+    ClusteringPolicy policy = classifier.getPolicy();
+    for (int iteration = 1; iteration <= numIterations; iteration++) {
+      for (Vector vector : data) {
+        // update the policy based upon the prior
+        policy.update(classifier);
+        // classification yields probabilities
+        Vector probabilities = classifier.classify(vector);
+        // policy selects weights for models given those probabilities
+        Vector weights = policy.select(probabilities);
+        // training causes all models to observe data
+        for (Vector.Element e : weights.nonZeroes()) {
+          int index = e.index();
+          classifier.train(index, vector, weights.get(index));
+        }
+      }
+      // compute the posterior models
+      classifier.close();
+    }
+    return classifier;
+  }
+
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a sequential
+   * implementation
+   *
+   * @param conf
+   *          the Configuration
+   * @param inPath
+   *          a Path to input VectorWritables
+   * @param priorPath
+   *          a Path to the prior classifier
+   * @param outPath
+   *          a Path of output directory
+   * @param numIterations
+   *          the int number of iterations to perform; must be at least 1
+   */
+  public static void iterateSeq(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+    throws IOException {
+    if (numIterations < 1) {
+      // fail fast: with zero iterations nothing is written and the final
+      // rename below would dereference a null clustersOut path
+      throw new IllegalArgumentException("numIterations must be at least 1: " + numIterations);
+    }
+    ClusterClassifier classifier = new ClusterClassifier();
+    classifier.readFromSeqFiles(conf, priorPath);
+    Path clustersOut = null;
+    int iteration = 1;
+    while (iteration <= numIterations) {
+      for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(inPath, PathType.LIST,
+          PathFilters.logsCRCFilter(), conf)) {
+        Vector vector = vw.get();
+        // classification yields probabilities
+        Vector probabilities = classifier.classify(vector);
+        // policy selects weights for models given those probabilities
+        Vector weights = classifier.getPolicy().select(probabilities);
+        // training causes all models to observe data
+        for (Vector.Element e : weights.nonZeroes()) {
+          int index = e.index();
+          classifier.train(index, vector, weights.get(index));
+        }
+      }
+      // compute the posterior models
+      classifier.close();
+      // update the policy
+      classifier.getPolicy().update(classifier);
+      // output the classifier
+      clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration);
+      classifier.writeToSeqFiles(clustersOut);
+      FileSystem fs = FileSystem.get(outPath.toUri(), conf);
+      iteration++;
+      if (isConverged(clustersOut, conf, fs)) {
+        break;
+      }
+    }
+    // mark the last iteration's output as final
+    Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX);
+    FileSystem.get(clustersOut.toUri(), conf).rename(clustersOut, finalClustersIn);
+  }
+
+  /**
+   * Iterate over data using a prior-trained ClusterClassifier, for a number of iterations using a mapreduce
+   * implementation
+   *
+   * @param conf
+   *          the Configuration
+   * @param inPath
+   *          a Path to input VectorWritables
+   * @param priorPath
+   *          a Path to the prior classifier
+   * @param outPath
+   *          a Path of output directory
+   * @param numIterations
+   *          the int number of iterations to perform; must be at least 1
+   */
+  public static void iterateMR(Configuration conf, Path inPath, Path priorPath, Path outPath, int numIterations)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    if (numIterations < 1) {
+      // fail fast: with zero iterations no job runs and the final rename
+      // below would dereference a null clustersOut path
+      throw new IllegalArgumentException("numIterations must be at least 1: " + numIterations);
+    }
+    ClusteringPolicy policy = ClusterClassifier.readPolicy(priorPath);
+    Path clustersOut = null;
+    int iteration = 1;
+    while (iteration <= numIterations) {
+      // publish the current prior to the mappers/reducers
+      conf.set(PRIOR_PATH_KEY, priorPath.toString());
+
+      String jobName = "Cluster Iterator running iteration " + iteration + " over priorPath: " + priorPath;
+      Job job = new Job(conf, jobName);
+      job.setMapOutputKeyClass(IntWritable.class);
+      job.setMapOutputValueClass(ClusterWritable.class);
+      job.setOutputKeyClass(IntWritable.class);
+      job.setOutputValueClass(ClusterWritable.class);
+
+      job.setInputFormatClass(SequenceFileInputFormat.class);
+      job.setOutputFormatClass(SequenceFileOutputFormat.class);
+      job.setMapperClass(CIMapper.class);
+      job.setReducerClass(CIReducer.class);
+
+      FileInputFormat.addInputPath(job, inPath);
+      clustersOut = new Path(outPath, Cluster.CLUSTERS_DIR + iteration);
+      // this iteration's output becomes the next iteration's prior
+      priorPath = clustersOut;
+      FileOutputFormat.setOutputPath(job, clustersOut);
+
+      job.setJarByClass(ClusterIterator.class);
+      if (!job.waitForCompletion(true)) {
+        throw new InterruptedException("Cluster Iteration " + iteration + " failed processing " + priorPath);
+      }
+      ClusterClassifier.writePolicy(policy, clustersOut);
+      FileSystem fs = FileSystem.get(outPath.toUri(), conf);
+      iteration++;
+      if (isConverged(clustersOut, conf, fs)) {
+        break;
+      }
+    }
+    // mark the last iteration's output as final
+    Path finalClustersIn = new Path(outPath, Cluster.CLUSTERS_DIR + (iteration - 1) + Cluster.FINAL_ITERATION_SUFFIX);
+    FileSystem.get(clustersOut.toUri(), conf).rename(clustersOut, finalClustersIn);
+  }
+
+  /**
+   * Return if all of the Clusters in the parts in the filePath have converged or not
+   *
+   * @param filePath
+   *          the file path to the single file containing the clusters
+   * @return true if all Clusters are converged
+   * @throws IOException
+   *           if there was an IO error
+   */
+  private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
+    for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
+      SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<>(
+          part.getPath(), true, conf);
+      try {
+        while (iterator.hasNext()) {
+          ClusterWritable value = iterator.next();
+          if (!value.getValue().isConverged()) {
+            return false;
+          }
+        }
+      } finally {
+        // BUGFIX: the iterator was previously closed only on the early-exit
+        // (non-converged) path, leaking the underlying reader whenever all
+        // clusters in a part had converged
+        Closeables.close(iterator, true);
+      }
+    }
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
new file mode 100644
index 0000000..855685f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusterWritable.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.sgd.PolymorphicWritable;
+import org.apache.mahout.clustering.Cluster;
+
+/**
+ * A Writable wrapper around a {@link Cluster}, delegating to
+ * {@link PolymorphicWritable} so the concrete Cluster subclass is preserved
+ * across serialization and deserialization.
+ */
+public class ClusterWritable implements Writable {
+
+  private Cluster value;
+
+  public ClusterWritable() {
+  }
+
+  public ClusterWritable(Cluster first) {
+    this.value = first;
+  }
+
+  /** Serialize the wrapped cluster together with its concrete class. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    PolymorphicWritable.write(out, value);
+  }
+
+  /** Deserialize a cluster of whichever concrete Cluster subclass was written. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    value = PolymorphicWritable.read(in, Cluster.class);
+  }
+
+  public Cluster getValue() {
+    return value;
+  }
+
+  public void setValue(Cluster value) {
+    this.value = value;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
new file mode 100644
index 0000000..6e15838
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicy.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.math.Vector;
+
+/**
+ * A ClusteringPolicy captures the semantics of assignment of points to
+ * clusters, abstracting the algorithm-specific pieces (classification, weight
+ * selection, policy update, and end-of-iteration close) of an iterative
+ * clustering run. Implementations are Writable so a policy can be persisted
+ * alongside its clusters.
+ */
+public interface ClusteringPolicy extends Writable {
+
+  /**
+   * Classify the data vector given the classifier's models
+   *
+   * @param data
+   *          a data Vector
+   * @param prior
+   *          a prior ClusterClassifier
+   * @return a Vector of probabilities that the data is described by each of the
+   *         models
+   */
+  Vector classify(Vector data, ClusterClassifier prior);
+
+  /**
+   * Return a vector of weights for each of the models given those probabilities
+   *
+   * @param probabilities
+   *          a Vector of pdfs
+   * @return a Vector of weights
+   */
+  Vector select(Vector probabilities);
+
+  /**
+   * Update the policy with the given classifier
+   *
+   * @param posterior
+   *          a ClusterClassifier
+   */
+  void update(ClusterClassifier posterior);
+
+  /**
+   * Close the policy using the classifier's models, performing any
+   * end-of-iteration computation the policy requires
+   *
+   * @param posterior
+   *          a posterior ClusterClassifier
+   */
+  void close(ClusterClassifier posterior);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
new file mode 100644
index 0000000..f69442d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/ClusteringPolicyWritable.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.sgd.PolymorphicWritable;
+
+/**
+ * A Writable wrapper around a {@link ClusteringPolicy}, delegating to
+ * {@link PolymorphicWritable} so the policy's concrete class survives
+ * serialization and deserialization.
+ */
+public class ClusteringPolicyWritable implements Writable {
+
+  private ClusteringPolicy value;
+
+  public ClusteringPolicyWritable() {
+  }
+
+  public ClusteringPolicyWritable(ClusteringPolicy policy) {
+    this.value = policy;
+  }
+
+  /** Serialize the wrapped policy together with its concrete class. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    PolymorphicWritable.write(out, value);
+  }
+
+  /** Deserialize a policy of whichever concrete class was written. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    value = PolymorphicWritable.read(in, ClusteringPolicy.class);
+  }
+
+  public ClusteringPolicy getValue() {
+    return value;
+  }
+
+  public void setValue(ClusteringPolicy value) {
+    this.value = value;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
new file mode 100644
index 0000000..f61aa27
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/DistanceMeasureCluster.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Model;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * A cluster whose pdf is derived from a pluggable {@link DistanceMeasure}:
+ * points closer to the center (under the measure) score higher.
+ */
+public class DistanceMeasureCluster extends AbstractCluster {
+
+  // Distance measure used by pdf(); serialized by class name in write().
+  private DistanceMeasure measure;
+
+  public DistanceMeasureCluster(Vector point, int id, DistanceMeasure measure) {
+    super(point, id);
+    this.measure = measure;
+  }
+
+  // No-arg constructor for Writable deserialization; measure is set in readFields().
+  public DistanceMeasureCluster() {
+  }
+
+  /** Propagate job configuration to the measure, if one has been set. */
+  @Override
+  public void configure(Configuration job) {
+    if (measure != null) {
+      measure.configure(job);
+    }
+  }
+
+  /**
+   * Deserialize: the measure's class name is read first (mirroring write()),
+   * the measure is re-instantiated from it, then superclass state follows.
+   * This order must stay in sync with write().
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    String dm = in.readUTF();
+    this.measure = ClassUtils.instantiateAs(dm, DistanceMeasure.class);
+    super.readFields(in);
+  }
+
+  /** Serialize: measure class name first, then superclass state (see readFields()). */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeUTF(measure.getClass().getName());
+    super.write(out);
+  }
+
+  /**
+   * Probability-like score in (0, 1]: 1 at the center, decreasing with the
+   * measured distance from the center.
+   */
+  @Override
+  public double pdf(VectorWritable vw) {
+    return 1 / (1 + measure.distance(vw.get(), getCenter()));
+  }
+
+  /** Return a new cluster with the same center, id and measure. */
+  @Override
+  public Model<VectorWritable> sampleFromPosterior() {
+    return new DistanceMeasureCluster(getCenter(), getId(), measure);
+  }
+
+  public DistanceMeasure getMeasure() {
+    return measure;
+  }
+
+  /**
+   * @param measure
+   *          the measure to set
+   */
+  public void setMeasure(DistanceMeasure measure) {
+    this.measure = measure;
+  }
+
+  @Override
+  public String getIdentifier() {
+    return "DMC:" + getId();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
new file mode 100644
index 0000000..b4e41b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/FuzzyKMeansClusteringPolicy.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansClusterer;
+import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This is a probability-weighted clustering policy, suitable for fuzzy k-means
+ * clustering: every model receives a non-trivial weight rather than a
+ * winner-take-all assignment.
+ */
+public class FuzzyKMeansClusteringPolicy extends AbstractClusteringPolicy {
+
+  // Fuzziness exponent; forwarded to FuzzyKMeansClusterer in classify().
+  private double m = 2;
+  // Threshold passed to each Kluster's convergence calculation in close().
+  private double convergenceDelta = 0.05;
+
+  public FuzzyKMeansClusteringPolicy() {
+  }
+
+  public FuzzyKMeansClusteringPolicy(double m, double convergenceDelta) {
+    this.m = m;
+    this.convergenceDelta = convergenceDelta;
+  }
+
+  /** Fuzzy selection: use the full probability vector as the weight vector. */
+  @Override
+  public Vector select(Vector probabilities) {
+    return probabilities;
+  }
+
+  /**
+   * Compute, for each model, the data point's distance to its center, then
+   * delegate to FuzzyKMeansClusterer (configured with fuzziness m) to turn
+   * the distances into membership probabilities.
+   * Note: assumes every model in the prior is a SoftCluster.
+   */
+  @Override
+  public Vector classify(Vector data, ClusterClassifier prior) {
+    Collection<SoftCluster> clusters = new ArrayList<>();
+    List<Double> distances = new ArrayList<>();
+    for (Cluster model : prior.getModels()) {
+      SoftCluster sc = (SoftCluster) model;
+      clusters.add(sc);
+      distances.add(sc.getMeasure().distance(data, sc.getCenter()));
+    }
+    FuzzyKMeansClusterer fuzzyKMeansClusterer = new FuzzyKMeansClusterer();
+    fuzzyKMeansClusterer.setM(m);
+    return fuzzyKMeansClusterer.computePi(clusters, distances);
+  }
+
+  /** Serialize m then convergenceDelta; readFields() mirrors this order. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(m);
+    out.writeDouble(convergenceDelta);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.m = in.readDouble();
+    this.convergenceDelta = in.readDouble();
+  }
+
+  /**
+   * End-of-iteration: run convergence calculation and parameter computation
+   * on every posterior model. Assumes every model is a Kluster.
+   */
+  @Override
+  public void close(ClusterClassifier posterior) {
+    for (Cluster cluster : posterior.getModels()) {
+      ((org.apache.mahout.clustering.kmeans.Kluster) cluster).calculateConvergence(convergenceDelta);
+      cluster.computeParameters();
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
new file mode 100644
index 0000000..1cc9faf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/iterator/KMeansClusteringPolicy.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.iterator;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+
+/**
+ * A simple maximum-likelihood clustering policy, suitable for k-means
+ * clustering.
+ */
+public class KMeansClusteringPolicy extends AbstractClusteringPolicy {
+
+  // Threshold handed to each Kluster's convergence calculation in close().
+  private double convergenceDelta = 0.001;
+
+  public KMeansClusteringPolicy() {
+  }
+
+  public KMeansClusteringPolicy(double convergenceDelta) {
+    this.convergenceDelta = convergenceDelta;
+  }
+
+  /** Serialize the convergence threshold. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(convergenceDelta);
+  }
+
+  /** Deserialize the convergence threshold. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    convergenceDelta = in.readDouble();
+  }
+
+  /**
+   * End-of-iteration: run convergence calculation and parameter computation
+   * on every posterior model. Assumes every model is a Kluster. The per-model
+   * convergence results are not aggregated here; each Kluster presumably
+   * retains its own converged state.
+   */
+  @Override
+  public void close(ClusterClassifier posterior) {
+    for (Cluster model : posterior.getModels()) {
+      org.apache.mahout.clustering.kmeans.Kluster kluster = (org.apache.mahout.clustering.kmeans.Kluster) model;
+      kluster.calculateConvergence(convergenceDelta);
+      model.computeParameters();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
new file mode 100644
index 0000000..96c4082
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/IKernelProfile.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kernel;
+
+public interface IKernelProfile {
+
+  /**
+   * @param distance the distance at which to evaluate the kernel
+   * @param h the kernel bandwidth
+   * @return the calculated derivative value of the kernel
+   */
+  double calculateDerivativeValue(double distance, double h);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
new file mode 100644
index 0000000..46909bb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kernel/TriangularKernelProfile.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kernel;
+
+/**
+ * Triangular kernel profile whose derivative value is a simple indicator:
+ * 1.0 strictly inside the bandwidth, 0.0 otherwise.
+ */
+public class TriangularKernelProfile implements IKernelProfile {
+
+  /** @return 1.0 when {@code distance < h}, otherwise 0.0 */
+  @Override
+  public double calculateDerivativeValue(double distance, double h) {
+    if (distance < h) {
+      return 1.0;
+    }
+    return 0.0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
new file mode 100644
index 0000000..3b9094e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
@@ -0,0 +1,257 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.ClusteringPolicy;
+import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Command-line driver for k-means clustering: optionally seeds k random
+ * initial centroids via {@link RandomSeedGenerator}, iterates the centroids
+ * to convergence, and optionally classifies the input vectors against the
+ * final clusters.
+ */
+public class KMeansDriver extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(KMeansDriver.class);
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new KMeansDriver(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator
+ .clustersInOption()
+ .withDescription(
+ "The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. "
+ + "If k is also specified, then a random set of vectors will be selected"
+ + " and written out to this path first").create());
+ addOption(DefaultOptionCreator
+ .numClustersOption()
+ .withDescription(
+ "The k in k-Means. If specified, then a random selection of k Vectors will be chosen"
+ + " as the Centroid and written to the clusters input path.").create());
+ addOption(DefaultOptionCreator.useSetRandomSeedOption().create());
+ addOption(DefaultOptionCreator.convergenceOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption(DefaultOptionCreator.clusteringOption().create());
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.outlierThresholdOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
+ Path output = getOutputPath();
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ // Default to squared Euclidean distance when no measure is supplied.
+ if (measureClass == null) {
+ measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ }
+ double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+
+ // When -k is given, sample k random input vectors into the clusters path as seeds,
+ // replacing whatever -c pointed at.
+ if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+ int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+
+ Long seed = null;
+ if (hasOption(DefaultOptionCreator.RANDOM_SEED)) {
+ seed = Long.parseLong(getOption(DefaultOptionCreator.RANDOM_SEED));
+ }
+
+ clusters = RandomSeedGenerator.buildRandom(getConf(), input, clusters, numClusters, measure, seed);
+ }
+ boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(
+ DefaultOptionCreator.SEQUENTIAL_METHOD);
+ // 0.0 means no outlier removal: every vector is assigned to some cluster.
+ double clusterClassificationThreshold = 0.0;
+ if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+ clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+ }
+ run(getConf(), input, clusters, output, convergenceDelta, maxIterations, runClustering,
+ clusterClassificationThreshold, runSequential);
+ return 0;
+ }
+
+ /**
+ * Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to
+ * cluster the input vectors.
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for initial & computed clusters
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+ * the convergence delta value
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param runClustering
+ * true if points are to be clustered after iterations are completed
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+ * having pdf below this value will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
+ */
+ public static void run(Configuration conf, Path input, Path clustersIn, Path output,
+ double convergenceDelta, int maxIterations, boolean runClustering, double clusterClassificationThreshold,
+ boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException {
+
+ // iterate until the clusters converge
+ String delta = Double.toString(convergenceDelta);
+ if (log.isInfoEnabled()) {
+ log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
+ log.info("convergence: {} max Iterations: {}", convergenceDelta, maxIterations);
+ }
+ Path clustersOut = buildClusters(conf, input, clustersIn, output, maxIterations, delta, runSequential);
+ if (runClustering) {
+ log.info("Clustering data");
+ clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
+ }
+ }
+
+ /**
+ * Convenience overload of {@link #run(Configuration, Path, Path, Path, double, int, boolean, double, boolean)}
+ * that uses a fresh default {@link Configuration}.
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for initial & computed clusters
+ * @param output
+ * the directory pathname for output points
+ * @param convergenceDelta
+ * the convergence delta value
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param runClustering
+ * true if points are to be clustered after iterations are completed
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+ * having pdf below this value will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
+ */
+ public static void run(Path input, Path clustersIn, Path output, double convergenceDelta,
+ int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ run(new Configuration(), input, clustersIn, output, convergenceDelta, maxIterations, runClustering,
+ clusterClassificationThreshold, runSequential);
+ }
+
+ /**
+ * Iterate over the input vectors to produce cluster directories for each iteration
+ *
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for initial & computed clusters
+ * @param output
+ * the directory pathname for output points
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param delta
+ * the convergence delta value
+ * @param runSequential
+ * if true execute sequential algorithm
+ *
+ * @return the Path of the final clusters directory
+ */
+ public static Path buildClusters(Configuration conf, Path input, Path clustersIn, Path output,
+ int maxIterations, String delta, boolean runSequential) throws IOException,
+ InterruptedException, ClassNotFoundException {
+
+ double convergenceDelta = Double.parseDouble(delta);
+ List<Cluster> clusters = new ArrayList<>();
+ KMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
+
+ if (clusters.isEmpty()) {
+ throw new IllegalStateException("No input clusters found in " + clustersIn + ". Check your -c argument.");
+ }
+
+ // Snapshot the priors into the output tree, then iterate from that snapshot.
+ Path priorClustersPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+ ClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta);
+ ClusterClassifier prior = new ClusterClassifier(clusters, policy);
+ prior.writeToSeqFiles(priorClustersPath);
+
+ if (runSequential) {
+ ClusterIterator.iterateSeq(conf, input, priorClustersPath, output, maxIterations);
+ } else {
+ ClusterIterator.iterateMR(conf, input, priorClustersPath, output, maxIterations);
+ }
+ // NOTE(review): returns the parent output dir (iteration dirs are created beneath it),
+ // not a specific clusters-N path — confirm downstream consumers expect this.
+ return output;
+ }
+
+ /**
+ * Run the job using supplied arguments
+ *
+ * @param input
+ * the directory pathname for input points
+ * @param clustersIn
+ * the directory pathname for input clusters
+ * @param output
+ * the directory pathname for output points
+ * @param clusterClassificationThreshold
+ * Is a clustering strictness / outlier removal parameter. Its value should be between 0 and 1. Vectors
+ * having pdf below this value will not be clustered.
+ * @param runSequential
+ * if true execute sequential algorithm
+ */
+ public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output,
+ double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
+ ClassNotFoundException {
+
+ if (log.isInfoEnabled()) {
+ log.info("Running Clustering");
+ log.info("Input: {} Clusters In: {} Out: {}", input, clustersIn, output);
+ }
+ // Write the clustering policy into the clusters directory before classification reads it.
+ ClusterClassifier.writePolicy(new KMeansClusteringPolicy(), clustersIn);
+ ClusterClassificationDriver.run(conf, input, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+ clusterClassificationThreshold, true, runSequential);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
new file mode 100644
index 0000000..3365f70
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kmeans;
+
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.Canopy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Helper for loading prior clusters from a sequence-file directory and
+ * adapting them to {@link Kluster} instances for the k-means iteration.
+ */
+final class KMeansUtil {
+
+ private static final Logger log = LoggerFactory.getLogger(KMeansUtil.class);
+
+ // Utility class: no instances.
+ private KMeansUtil() {}
+
+ /**
+ * Create a list of Klusters from whatever Cluster type is passed in as the prior
+ *
+ * @param conf
+ * the Configuration
+ * @param clusterPath
+ * the path to the prior Clusters
+ * @param clusters
+ * a List<Cluster> to put values into
+ * @throws IllegalStateException
+ * if a value is neither a ClusterWritable-wrapped value, a Kluster, nor a Canopy
+ */
+ public static void configureWithClusterInfo(Configuration conf, Path clusterPath, Collection<Cluster> clusters) {
+ for (Writable value : new SequenceFileDirValueIterable<>(clusterPath, PathType.LIST,
+ PathFilters.partFilter(), conf)) {
+ Class<? extends Writable> valueClass = value.getClass();
+ // Unwrap ClusterWritable to get at the concrete Cluster implementation inside.
+ if (valueClass.equals(ClusterWritable.class)) {
+ ClusterWritable clusterWritable = (ClusterWritable) value;
+ value = clusterWritable.getValue();
+ valueClass = value.getClass();
+ }
+ log.debug("Read 1 Cluster from {}", clusterPath);
+
+ if (valueClass.equals(Kluster.class)) {
+ // get the cluster info
+ clusters.add((Kluster) value);
+ } else if (valueClass.equals(Canopy.class)) {
+ // get the cluster info: seed a Kluster at the canopy's center.
+ Canopy canopy = (Canopy) value;
+ clusters.add(new Kluster(canopy.getCenter(), canopy.getId(), canopy.getMeasure()));
+ } else {
+ throw new IllegalStateException("Bad value class: " + valueClass);
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
new file mode 100644
index 0000000..15daec5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/Kluster.java
@@ -0,0 +1,117 @@
+/* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+/**
+ * A k-means cluster: a {@link DistanceMeasureCluster} that additionally
+ * tracks whether its centroid has converged with its center.
+ */
+public class Kluster extends DistanceMeasureCluster {
+
+ /** Has the centroid converged with the center? */
+ private boolean converged;
+
+ /** For (de)serialization as a Writable */
+ public Kluster() {
+ }
+
+ /**
+ * Construct a new cluster with the given point as its center
+ *
+ * @param center
+ * the Vector center
+ * @param clusterId
+ * the int cluster id
+ * @param measure
+ * a DistanceMeasure
+ */
+ public Kluster(Vector center, int clusterId, DistanceMeasure measure) {
+ super(center, clusterId, measure);
+ }
+
+ /**
+ * Format the cluster for output
+ *
+ * @param cluster
+ * the Cluster
+ * @return the String representation of the Cluster
+ */
+ public static String formatCluster(Kluster cluster) {
+ return cluster.getIdentifier() + ": " + cluster.computeCentroid().asFormatString();
+ }
+
+ public String asFormatString() {
+ return formatCluster(this);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ super.write(out);
+ // Serialize the convergence flag after the superclass state; readFields mirrors this order.
+ out.writeBoolean(converged);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ super.readFields(in);
+ this.converged = in.readBoolean();
+ }
+
+ @Override
+ public String toString() {
+ // Note: calls an asFormatString(String[] bindings) overload inherited from the
+ // superclass (see the null argument), not the no-arg asFormatString() above.
+ return asFormatString(null);
+ }
+
+ @Override
+ public String getIdentifier() {
+ // Prefix encodes convergence state: "VL-" when converged, "CL-" otherwise.
+ return (converged ? "VL-" : "CL-") + getId();
+ }
+
+ /**
+ * Return if the cluster is converged by comparing its center and centroid.
+ *
+ * @param measure
+ * The distance measure to use for cluster-point comparisons.
+ * @param convergenceDelta
+ * the convergence delta to use for stopping.
+ * @return if the cluster is converged
+ */
+ public boolean computeConvergence(DistanceMeasure measure, double convergenceDelta) {
+ Vector centroid = computeCentroid();
+ converged = measure.distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+ return converged;
+ }
+
+ @Override
+ public boolean isConverged() {
+ return converged;
+ }
+
+ protected void setConverged(boolean converged) {
+ this.converged = converged;
+ }
+
+ /**
+ * Like {@link #computeConvergence(DistanceMeasure, double)} but uses this
+ * cluster's own measure rather than a caller-supplied one.
+ * NOTE(review): near-duplicate logic — consider having one delegate to the other.
+ */
+ public boolean calculateConvergence(double convergenceDelta) {
+ Vector centroid = computeCentroid();
+ converged = getMeasure().distance(centroid.getLengthSquared(), centroid, getCenter()) <= convergenceDelta;
+ return converged;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
new file mode 100644
index 0000000..fbbabc5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/RandomSeedGenerator.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.kmeans;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Given an Input Path containing a {@link org.apache.hadoop.io.SequenceFile}, randomly select k vectors and
+ * write them to the output file as a {@link org.apache.mahout.clustering.kmeans.Kluster} representing the
+ * initial centroid to use.
+ *
+ * This implementation uses reservoir sampling as described in http://en.wikipedia.org/wiki/Reservoir_sampling
+ */
+public final class RandomSeedGenerator {
+
+ private static final Logger log = LoggerFactory.getLogger(RandomSeedGenerator.class);
+
+ public static final String K = "k";
+
+ private RandomSeedGenerator() {}
+
+ public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
+ throws IOException {
+ return buildRandom(conf, input, output, k, measure, null);
+ }
+
+ public static Path buildRandom(Configuration conf,
+ Path input,
+ Path output,
+ int k,
+ DistanceMeasure measure,
+ Long seed) throws IOException {
+
+ Preconditions.checkArgument(k > 0, "Must be: k > 0, but k = " + k);
+ // delete the output directory
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ HadoopUtil.delete(conf, output);
+ Path outFile = new Path(output, "part-randomSeed");
+ boolean newFile = fs.createNewFile(outFile);
+ if (newFile) {
+ Path inputPathPattern;
+
+ if (fs.getFileStatus(input).isDir()) {
+ inputPathPattern = new Path(input, "*");
+ } else {
+ inputPathPattern = input;
+ }
+
+ FileStatus[] inputFiles = fs.globStatus(inputPathPattern, PathFilters.logsCRCFilter());
+
+ Random random = (seed != null) ? RandomUtils.getRandom(seed) : RandomUtils.getRandom();
+
+ List<Text> chosenTexts = new ArrayList<>(k);
+ List<ClusterWritable> chosenClusters = new ArrayList<>(k);
+ int nextClusterId = 0;
+
+ int index = 0;
+ for (FileStatus fileStatus : inputFiles) {
+ if (!fileStatus.isDir()) {
+ for (Pair<Writable, VectorWritable> record
+ : new SequenceFileIterable<Writable, VectorWritable>(fileStatus.getPath(), true, conf)) {
+ Writable key = record.getFirst();
+ VectorWritable value = record.getSecond();
+ Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
+ newCluster.observe(value.get(), 1);
+ Text newText = new Text(key.toString());
+ int currentSize = chosenTexts.size();
+ if (currentSize < k) {
+ chosenTexts.add(newText);
+ ClusterWritable clusterWritable = new ClusterWritable();
+ clusterWritable.setValue(newCluster);
+ chosenClusters.add(clusterWritable);
+ } else {
+ int j = random.nextInt(index);
+ if (j < k) {
+ chosenTexts.set(j, newText);
+ ClusterWritable clusterWritable = new ClusterWritable();
+ clusterWritable.setValue(newCluster);
+ chosenClusters.set(j, clusterWritable);
+ }
+ }
+ index++;
+ }
+ }
+ }
+
+ try (SequenceFile.Writer writer =
+ SequenceFile.createWriter(fs, conf, outFile, Text.class, ClusterWritable.class)){
+ for (int i = 0; i < chosenTexts.size(); i++) {
+ writer.append(chosenTexts.get(i), chosenClusters.get(i));
+ }
+ log.info("Wrote {} Klusters to {}", k, outFile);
+ }
+ }
+
+ return outFile;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
new file mode 100644
index 0000000..d6921b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/kmeans/package-info.java
@@ -0,0 +1,5 @@
+/**
+ * This package provides an implementation of the
+ * <a href="http://en.wikipedia.org/wiki/K-means_clustering">k-means</a> clustering algorithm.
+ */
+package org.apache.mahout.clustering.kmeans;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
new file mode 100644
index 0000000..46fcc7f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0DocInferenceMapper.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseRowMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+/**
+ * Mapper that infers a per-document topic distribution against a previously
+ * trained CVB0 topic model and emits one topic vector per document id.
+ */
+public class CVB0DocInferenceMapper extends CachingCVB0Mapper {
+
+ // Reused across map() calls, per Hadoop's writable-reuse convention.
+ private final VectorWritable topics = new VectorWritable();
+
+ @Override
+ public void map(IntWritable docId, VectorWritable doc, Context context)
+ throws IOException, InterruptedException {
+ int numTopics = getNumTopics();
+ // Start inference from a uniform topic distribution for this document.
+ Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics);
+ Matrix docModel = new SparseRowMatrix(numTopics, doc.get().size());
+ int maxIters = getMaxIters();
+ ModelTrainer modelTrainer = getModelTrainer();
+ // Fixed number of inference passes; trainDocTopicModel presumably refines
+ // docTopics/docModel in place each pass — confirm against ModelTrainer.
+ for (int i = 0; i < maxIters; i++) {
+ modelTrainer.getReadModel().trainDocTopicModel(doc.get(), docTopics, docModel);
+ }
+ topics.set(docTopics);
+ context.write(docId, topics);
+ }
+
+ @Override
+ protected void cleanup(Context context) {
+ // Stop the shared model trainer when the map task finishes.
+ getModelTrainer().stop();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
new file mode 100644
index 0000000..31c0d60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0Driver.java
@@ -0,0 +1,536 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.mapreduce.VectorSumReducer;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * See {@link CachingCVB0Mapper} for more details on scalability and room for improvement.
+ * To try out this LDA implementation without using Hadoop, check out
+ * {@link InMemoryCollapsedVariationalBayes0}. If you want to do training directly in java code
+ * with your own main(), then look to {@link ModelTrainer} and {@link TopicModel}.
+ *
+ * Usage: {@code ./bin/mahout cvb <i>options</i>}
+ * <p>
+ * Valid options include:
+ * <dl>
+ * <dt>{@code --input path}</dt>
+ * <dd>Input path for {@code SequenceFile<IntWritable, VectorWritable>} document vectors. See
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles}
+ * for details on how to generate this input format.</dd>
+ * <dt>{@code --dictionary path}</dt>
+ * <dd>Path to dictionary file(s) generated during construction of input document vectors (glob
+ * expression supported). If set, this data is scanned to determine an appropriate value for option
+ * {@code --num_terms}.</dd>
+ * <dt>{@code --output path}</dt>
+ * <dd>Output path for topic-term distributions.</dd>
+ * <dt>{@code --doc_topic_output path}</dt>
+ * <dd>Output path for doc-topic distributions.</dd>
+ * <dt>{@code --num_topics k}</dt>
+ * <dd>Number of latent topics.</dd>
+ * <dt>{@code --num_terms nt}</dt>
+ * <dd>Number of unique features defined by input document vectors. If option {@code --dictionary}
+ * is defined and this option is unspecified, term count is calculated from dictionary.</dd>
+ * <dt>{@code --topic_model_temp_dir path}</dt>
+ * <dd>Path in which to store model state after each iteration.</dd>
+ * <dt>{@code --maxIter i}</dt>
+ * <dd>Maximum number of iterations to perform. If this value is less than or equal to the number of
+ * iteration states found beneath the path specified by option {@code --topic_model_temp_dir}, no
+ * further iterations are performed. Instead, output topic-term and doc-topic distributions are
+ * generated using data from the specified iteration.</dd>
+ * <dt>{@code --max_doc_topic_iters i}</dt>
+ * <dd>Maximum number of iterations per doc for p(topic|doc) learning. Defaults to {@code 10}.</dd>
+ * <dt>{@code --doc_topic_smoothing a}</dt>
+ * <dd>Smoothing for doc-topic distribution. Defaults to {@code 0.0001}.</dd>
+ * <dt>{@code --term_topic_smoothing e}</dt>
+ * <dd>Smoothing for topic-term distribution. Defaults to {@code 0.0001}.</dd>
+ * <dt>{@code --random_seed seed}</dt>
+ * <dd>Integer seed for random number generation.</dd>
+ * <dt>{@code --test_set_percentage p}</dt>
+ * <dd>Fraction of data to hold out for testing. Defaults to {@code 0.0}.</dd>
+ * <dt>{@code --iteration_block_size block}</dt>
+ * <dd>Number of iterations between perplexity checks. Defaults to {@code 10}. This option is
+ * ignored unless option {@code --test_set_percentage} is greater than zero.</dd>
+ * </dl>
+ */
+public class CVB0Driver extends AbstractJob {
+  private static final Logger log = LoggerFactory.getLogger(CVB0Driver.class);
+
+  public static final String NUM_TOPICS = "num_topics";
+  public static final String NUM_TERMS = "num_terms";
+  public static final String DOC_TOPIC_SMOOTHING = "doc_topic_smoothing";
+  public static final String TERM_TOPIC_SMOOTHING = "term_topic_smoothing";
+  public static final String DICTIONARY = "dictionary";
+  public static final String DOC_TOPIC_OUTPUT = "doc_topic_output";
+  public static final String MODEL_TEMP_DIR = "topic_model_temp_dir";
+  public static final String ITERATION_BLOCK_SIZE = "iteration_block_size";
+  public static final String RANDOM_SEED = "random_seed";
+  public static final String TEST_SET_FRACTION = "test_set_fraction";
+  public static final String NUM_TRAIN_THREADS = "num_train_threads";
+  public static final String NUM_UPDATE_THREADS = "num_update_threads";
+  public static final String MAX_ITERATIONS_PER_DOC = "max_doc_topic_iters";
+  public static final String MODEL_WEIGHT = "prev_iter_mult";
+  public static final String NUM_REDUCE_TASKS = "num_reduce_tasks";
+  public static final String BACKFILL_PERPLEXITY = "backfill_perplexity";
+  private static final String MODEL_PATHS = "mahout.lda.cvb.modelPath";
+
+  private static final double DEFAULT_CONVERGENCE_DELTA = 0;
+  private static final double DEFAULT_DOC_TOPIC_SMOOTHING = 0.0001;
+  private static final double DEFAULT_TERM_TOPIC_SMOOTHING = 0.0001;
+  private static final int DEFAULT_ITERATION_BLOCK_SIZE = 10;
+  private static final double DEFAULT_TEST_SET_FRACTION = 0;
+  private static final int DEFAULT_NUM_TRAIN_THREADS = 4;
+  private static final int DEFAULT_NUM_UPDATE_THREADS = 1;
+  private static final int DEFAULT_MAX_ITERATIONS_PER_DOC = 10;
+  private static final int DEFAULT_NUM_REDUCE_TASKS = 10;
+
+  @Override
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.maxIterationsOption().create());
+    addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value",
+        String.valueOf(DEFAULT_CONVERGENCE_DELTA));
+    addOption(DefaultOptionCreator.overwriteOption().create());
+
+    addOption(NUM_TOPICS, "k", "Number of topics to learn", true);
+    addOption(NUM_TERMS, "nt", "Vocabulary size", false);
+    addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution",
+        String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING));
+    addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution",
+        String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING));
+    addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false);
+    addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false);
+    addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false);
+    addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check",
+        String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE));
+    addOption(RANDOM_SEED, "seed", "Random seed", false);
+    addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing",
+        String.valueOf(DEFAULT_TEST_SET_FRACTION));
+    addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with",
+        String.valueOf(DEFAULT_NUM_TRAIN_THREADS));
+    addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with",
+        String.valueOf(DEFAULT_NUM_UPDATE_THREADS));
+    addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning",
+        String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC));
+    addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
+        String.valueOf(DEFAULT_NUM_REDUCE_TASKS));
+    addOption(buildOption(BACKFILL_PERPLEXITY, null, "enable backfilling of missing perplexity values", false, false,
+        null));
+
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    int numTopics = Integer.parseInt(getOption(NUM_TOPICS));
+    Path inputPath = getInputPath();
+    Path topicModelOutputPath = getOutputPath();
+    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+    int iterationBlockSize = Integer.parseInt(getOption(ITERATION_BLOCK_SIZE));
+    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+    double alpha = Double.parseDouble(getOption(DOC_TOPIC_SMOOTHING));
+    double eta = Double.parseDouble(getOption(TERM_TOPIC_SMOOTHING));
+    int numTrainThreads = Integer.parseInt(getOption(NUM_TRAIN_THREADS));
+    int numUpdateThreads = Integer.parseInt(getOption(NUM_UPDATE_THREADS));
+    int maxItersPerDoc = Integer.parseInt(getOption(MAX_ITERATIONS_PER_DOC));
+    Path dictionaryPath = hasOption(DICTIONARY) ? new Path(getOption(DICTIONARY)) : null;
+    int numTerms = hasOption(NUM_TERMS)
+        ? Integer.parseInt(getOption(NUM_TERMS))
+        : getNumTerms(getConf(), dictionaryPath);
+    Path docTopicOutputPath = hasOption(DOC_TOPIC_OUTPUT) ? new Path(getOption(DOC_TOPIC_OUTPUT)) : null;
+    Path modelTempPath = hasOption(MODEL_TEMP_DIR)
+        ? new Path(getOption(MODEL_TEMP_DIR))
+        : getTempPath("topicModelState");
+    long seed = hasOption(RANDOM_SEED)
+        ? Long.parseLong(getOption(RANDOM_SEED))
+        : System.nanoTime() % 10000;
+    float testFraction = hasOption(TEST_SET_FRACTION)
+        ? Float.parseFloat(getOption(TEST_SET_FRACTION))
+        : 0.0f;
+    int numReduceTasks = Integer.parseInt(getOption(NUM_REDUCE_TASKS));
+    boolean backfillPerplexity = hasOption(BACKFILL_PERPLEXITY);
+
+    return run(getConf(), inputPath, topicModelOutputPath, numTopics, numTerms, alpha, eta,
+        maxIterations, iterationBlockSize, convergenceDelta, dictionaryPath, docTopicOutputPath,
+        modelTempPath, seed, testFraction, numTrainThreads, numUpdateThreads, maxItersPerDoc,
+        numReduceTasks, backfillPerplexity);
+  }
+
+  /**
+   * Scans the dictionary file(s) for the maximum term id and returns that id plus one.
+   * Readers are closed even if a read fails part-way through a file.
+   */
+  private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException {
+    FileSystem fs = dictionaryPath.getFileSystem(conf);
+    Text key = new Text();
+    IntWritable value = new IntWritable();
+    int maxTermId = -1;
+    for (FileStatus stat : fs.globStatus(dictionaryPath)) {
+      SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf);
+      try {
+        while (reader.next(key, value)) {
+          maxTermId = Math.max(maxTermId, value.get());
+        }
+      } finally {
+        // Close each reader before moving to the next glob match to avoid leaking file handles.
+        reader.close();
+      }
+    }
+    return maxTermId + 1;
+  }
+
+  /**
+   * Runs the full CVB0 training pipeline: resumes from any prior model state, iterates until
+   * {@code maxIterations} or convergence, then writes the final topic-term and (optionally)
+   * doc-topic distributions.
+   *
+   * @return 0 on success, -1 if either output-writing job fails
+   */
+  public int run(Configuration conf,
+                 Path inputPath,
+                 Path topicModelOutputPath,
+                 int numTopics,
+                 int numTerms,
+                 double alpha,
+                 double eta,
+                 int maxIterations,
+                 int iterationBlockSize,
+                 double convergenceDelta,
+                 Path dictionaryPath,
+                 Path docTopicOutputPath,
+                 Path topicModelStateTempPath,
+                 long randomSeed,
+                 float testFraction,
+                 int numTrainThreads,
+                 int numUpdateThreads,
+                 int maxItersPerDoc,
+                 int numReduceTasks,
+                 boolean backfillPerplexity)
+    throws ClassNotFoundException, IOException, InterruptedException {
+
+    setConf(conf);
+
+    // verify arguments
+    Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0,
+        "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction);
+    Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0,
+        "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction);
+
+    String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) "
+        + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, "
+        + "topic/term prior {}. Maximum iterations to run will be {}, unless the change in "
+        + "perplexity is less than {}. Topic model output (p(term|topic) for each topic) will be "
+        + "stored {}. Random initialization seed is {}, holding out {} of the data for perplexity "
+        + "check\n";
+    log.info(infoString, inputPath, numTerms, numTopics, alpha, eta, maxIterations,
+        convergenceDelta, topicModelOutputPath, randomSeed, testFraction);
+    infoString = dictionaryPath == null
+        ? "" : "Dictionary to be used located " + dictionaryPath.toString() + '\n';
+    infoString += docTopicOutputPath == null
+        ? "" : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n';
+    log.info(infoString);
+
+    FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf);
+    int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations);
+    log.info("Current iteration number: {}", iterationNumber);
+
+    conf.set(NUM_TOPICS, String.valueOf(numTopics));
+    conf.set(NUM_TERMS, String.valueOf(numTerms));
+    conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha));
+    conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta));
+    conf.set(RANDOM_SEED, String.valueOf(randomSeed));
+    conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads));
+    conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads));
+    conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc));
+    conf.set(MODEL_WEIGHT, "1"); // TODO
+    conf.set(TEST_SET_FRACTION, String.valueOf(testFraction));
+
+    // Recover perplexities from completed iterations, optionally backfilling missing ones.
+    List<Double> perplexities = new ArrayList<>();
+    for (int i = 1; i <= iterationNumber; i++) {
+      // form path to model
+      Path modelPath = modelPath(topicModelStateTempPath, i);
+
+      // read perplexity
+      double perplexity = readPerplexity(conf, topicModelStateTempPath, i);
+      if (Double.isNaN(perplexity)) {
+        if (!(backfillPerplexity && i % iterationBlockSize == 0)) {
+          continue;
+        }
+        log.info("Backfilling perplexity at iteration {}", i);
+        if (!fs.exists(modelPath)) {
+          log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation",
+              modelPath.toString(), i);
+          continue;
+        }
+        perplexity = calculatePerplexity(conf, inputPath, modelPath, i);
+      }
+
+      // register and log perplexity
+      perplexities.add(perplexity);
+      log.info("Perplexity at iteration {} = {}", i, perplexity);
+    }
+
+    long startTime = System.currentTimeMillis();
+    while (iterationNumber < maxIterations) {
+      // test convergence
+      if (convergenceDelta > 0.0) {
+        double delta = rateOfChange(perplexities);
+        if (delta < convergenceDelta) {
+          log.info("Convergence achieved at iteration {} with perplexity {} and delta {}",
+              iterationNumber, perplexities.get(perplexities.size() - 1), delta);
+          break;
+        }
+      }
+
+      // update model
+      iterationNumber++;
+      log.info("About to run iteration {} of {}", iterationNumber, maxIterations);
+      Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1);
+      Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber);
+      runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber,
+          maxIterations, numReduceTasks);
+
+      // calculate perplexity
+      if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) {
+        perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber));
+        log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1));
+        log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize,
+            rateOfChange(perplexities), convergenceDelta);
+      }
+    }
+    log.info("Completed {} iterations in {} seconds", iterationNumber,
+        (System.currentTimeMillis() - startTime) / 1000);
+    log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities));
+
+    // write final topic-term and doc-topic distributions
+    Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber);
+    Job topicModelOutputJob = topicModelOutputPath != null
+        ? writeTopicModel(conf, finalIterationData, topicModelOutputPath)
+        : null;
+    Job docInferenceJob = docTopicOutputPath != null
+        ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath)
+        : null;
+    if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) {
+      return -1;
+    }
+    if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) {
+      return -1;
+    }
+    return 0;
+  }
+
+  /**
+   * Relative change between the last two perplexities, normalized by the first; returns
+   * {@code Double.MAX_VALUE} when fewer than two perplexities exist (never "converged" early).
+   */
+  private static double rateOfChange(List<Double> perplexities) {
+    int sz = perplexities.size();
+    if (sz < 2) {
+      return Double.MAX_VALUE;
+    }
+    return Math.abs(perplexities.get(sz - 1) - perplexities.get(sz - 2)) / perplexities.get(0);
+  }
+
+  /**
+   * Runs a single-reducer perplexity job over the held-out fraction of the corpus and returns
+   * the resulting perplexity for the given model/iteration.
+   */
+  private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = "Calculating perplexity for " + modelPath;
+    log.info("About to run: {}", jobName);
+
+    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
+    Job job = prepareJob(corpusPath, outputPath, CachingCVB0PerplexityMapper.class, DoubleWritable.class,
+        DoubleWritable.class, DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
+
+    job.setJobName(jobName);
+    job.setCombinerClass(DualDoubleSumReducer.class);
+    job.setNumReduceTasks(1);
+    setModelPaths(job, modelPath);
+    HadoopUtil.delete(conf, outputPath);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
+    }
+    return readPerplexity(conf, modelPath.getParent(), iteration);
+  }
+
+  /**
+   * Sums keys and values independently.
+   */
+  public static class DualDoubleSumReducer extends
+      Reducer<DoubleWritable, DoubleWritable, DoubleWritable, DoubleWritable> {
+    private final DoubleWritable outKey = new DoubleWritable();
+    private final DoubleWritable outValue = new DoubleWritable();
+
+    @Override
+    public void run(Context context) throws IOException,
+        InterruptedException {
+      double keySum = 0.0;
+      double valueSum = 0.0;
+      while (context.nextKey()) {
+        keySum += context.getCurrentKey().get();
+        for (DoubleWritable value : context.getValues()) {
+          valueSum += value.get();
+        }
+      }
+      outKey.set(keySum);
+      outValue.set(valueSum);
+      context.write(outKey, outValue);
+    }
+  }
+
+  /**
+   * Reads the perplexity data written for the given iteration and returns the total perplexity
+   * divided by the total model weight of the documents sampled during the computation.
+   *
+   * @param conf the job configuration
+   * @param topicModelStateTemp directory holding per-iteration model/perplexity state
+   * @param iteration the iteration whose perplexity output should be read
+   * @return the weighted perplexity, or {@code Double.NaN} if no perplexity data exists for the
+   *         given iteration
+   * @throws IOException if the perplexity files cannot be read
+   */
+  public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
+    throws IOException {
+    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
+    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
+    if (!fs.exists(perplexityPath)) {
+      log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
+      return Double.NaN;
+    }
+    double perplexity = 0;
+    double modelWeight = 0;
+    long n = 0;
+    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
+        perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
+      modelWeight += pair.getFirst().get();
+      perplexity += pair.getSecond().get();
+      n++;
+    }
+    log.info("Read {} entries with total perplexity {} and model weight {}", n,
+        perplexity, modelWeight);
+    return perplexity / modelWeight;
+  }
+
+  /** Submits (without waiting for) a job that L1-normalizes the final model into {@code output}. */
+  private Job writeTopicModel(Configuration conf, Path modelInput, Path output)
+    throws IOException, InterruptedException, ClassNotFoundException {
+    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
+    log.info("About to run: {}", jobName);
+
+    Job job = prepareJob(modelInput, output, SequenceFileInputFormat.class, CVB0TopicTermVectorNormalizerMapper.class,
+        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, jobName);
+    job.submit();
+    return job;
+  }
+
+  /**
+   * Submits (without waiting for) a job that infers p(topic|doc) for the corpus using the final
+   * model, distributing the model part-files via the distributed cache when they exist.
+   */
+  private Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
+    log.info("About to run: {}", jobName);
+
+    Job job = prepareJob(corpus, output, SequenceFileInputFormat.class, CVB0DocInferenceMapper.class,
+        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, jobName);
+
+    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
+    if (modelInput != null && fs.exists(modelInput)) {
+      FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
+      URI[] modelUris = new URI[statuses.length];
+      for (int i = 0; i < statuses.length; i++) {
+        modelUris[i] = statuses[i].getPath().toUri();
+      }
+      DistributedCache.setCacheFiles(modelUris, conf);
+      setModelPaths(job, modelInput);
+    }
+    job.submit();
+    return job;
+  }
+
+  /** Path of the model state written by the given iteration. */
+  public static Path modelPath(Path topicModelStateTempPath, int iterationNumber) {
+    return new Path(topicModelStateTempPath, "model-" + iterationNumber);
+  }
+
+  /** Path of the perplexity output written for the given iteration. */
+  public static Path perplexityPath(Path topicModelStateTempPath, int iterationNumber) {
+    return new Path(topicModelStateTempPath, "perplexity-" + iterationNumber);
+  }
+
+  /**
+   * Returns the number of completed iterations found under {@code modelTempDir}, i.e. the
+   * largest consecutive iteration (starting at 1, capped at {@code maxIterations}) whose model
+   * path exists; 0 when no prior state exists.
+   */
+  private static int getCurrentIterationNumber(Configuration config, Path modelTempDir, int maxIterations)
+    throws IOException {
+    FileSystem fs = FileSystem.get(modelTempDir.toUri(), config);
+    int iterationNumber = 1;
+    Path iterationPath = modelPath(modelTempDir, iterationNumber);
+    while (fs.exists(iterationPath) && iterationNumber <= maxIterations) {
+      log.info("Found previous state: {}", iterationPath);
+      iterationNumber++;
+      iterationPath = modelPath(modelTempDir, iterationNumber);
+    }
+    return iterationNumber - 1;
+  }
+
+  /**
+   * Runs one model-update iteration and blocks until it completes.
+   *
+   * @throws InterruptedException if the Hadoop job fails
+   */
+  public void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
+                           int iterationNumber, int maxIterations, int numReduceTasks)
+    throws IOException, ClassNotFoundException, InterruptedException {
+    String jobName = String.format("Iteration %d of %d, input path: %s",
+        iterationNumber, maxIterations, modelInput);
+    log.info("About to run: {}", jobName);
+    Job job = prepareJob(corpusInput, modelOutput, CachingCVB0Mapper.class, IntWritable.class, VectorWritable.class,
+        VectorSumReducer.class, IntWritable.class, VectorWritable.class);
+    job.setCombinerClass(VectorSumReducer.class);
+    job.setNumReduceTasks(numReduceTasks);
+    job.setJobName(jobName);
+    setModelPaths(job, modelInput);
+    HadoopUtil.delete(conf, modelOutput);
+    if (!job.waitForCompletion(true)) {
+      throw new InterruptedException(String.format("Failed to complete iteration %d stage 1",
+          iterationNumber));
+    }
+  }
+
+  /**
+   * Records the model's part-file URIs in the job configuration under {@link #MODEL_PATHS}.
+   * No-op when {@code modelPath} is null or absent (first iteration starts from scratch).
+   */
+  private static void setModelPaths(Job job, Path modelPath) throws IOException {
+    Configuration conf = job.getConfiguration();
+    if (modelPath == null || !FileSystem.get(modelPath.toUri(), conf).exists(modelPath)) {
+      return;
+    }
+    FileStatus[] statuses = FileSystem.get(modelPath.toUri(), conf).listStatus(modelPath, PathFilters.partFilter());
+    Preconditions.checkState(statuses.length > 0, "No part files found in model path '%s'", modelPath.toString());
+    String[] modelPaths = new String[statuses.length];
+    for (int i = 0; i < statuses.length; i++) {
+      modelPaths[i] = statuses[i].getPath().toUri().toString();
+    }
+    conf.setStrings(MODEL_PATHS, modelPaths);
+  }
+
+  /** Returns the model part-file paths recorded by {@link #setModelPaths}, or null if unset. */
+  public static Path[] getModelPaths(Configuration conf) {
+    String[] modelPathNames = conf.getStrings(MODEL_PATHS);
+    if (modelPathNames == null || modelPathNames.length == 0) {
+      return null;
+    }
+    Path[] modelPaths = new Path[modelPathNames.length];
+    for (int i = 0; i < modelPathNames.length; i++) {
+      modelPaths[i] = new Path(modelPathNames[i]);
+    }
+    return modelPaths;
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new CVB0Driver(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
new file mode 100644
index 0000000..1253942
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/lda/cvb/CVB0TopicTermVectorNormalizerMapper.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering.lda.cvb;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import java.io.IOException;
+
+/**
+ * Performs L1 normalization of input vectors.
+ */
+public class CVB0TopicTermVectorNormalizerMapper extends
+    Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
+
+  /** Divides each topic's term vector by its L1 norm in place, then re-emits it. */
+  @Override
+  protected void map(IntWritable key, VectorWritable value, Context context) throws IOException,
+      InterruptedException {
+    double l1Norm = value.get().norm(1.0);
+    value.get().assign(Functions.div(l1Norm));
+    context.write(key, value);
+  }
+}
r***@apache.org
2018-06-28 14:54:37 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
new file mode 100644
index 0000000..fbc825d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RecordFactory.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.math.Vector;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * A record factory understands how to convert a line of data into fields and then into a vector.
+ */
+public interface RecordFactory {
+  /** Defines the valid set of target (class) labels. */
+  void defineTargetCategories(List<String> values);
+
+  /** Limits the number of target values; returns this factory to allow call chaining. */
+  RecordFactory maxTargetValue(int max);
+
+  /** Returns true if the first input line is a schema/header rather than a data record. */
+  boolean usesFirstLineAsSchema();
+
+  /**
+   * Parses one line of data into {@code featureVector}. NOTE(review): the returned int is
+   * presumably the target-category index — confirm against implementations.
+   */
+  int processLine(String line, Vector featureVector);
+
+  /** Returns the names of the predictor (input) fields. */
+  Iterable<String> getPredictors();
+
+  /** Returns the trace dictionary mapping feature names to the vector indexes they touch. */
+  Map<String, Set<Integer>> getTraceDictionary();
+
+  /** Enables or disables a bias (intercept) term; returns this factory to allow call chaining. */
+  RecordFactory includeBiasTerm(boolean useBias);
+
+  /** Returns the target categories previously defined or read from the schema. */
+  List<String> getTargetCategories();
+
+  /** Consumes the first line of input (the schema, when {@link #usesFirstLineAsSchema()} is true). */
+  void firstLine(String line);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
new file mode 100644
index 0000000..0a7b6a7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/TPrior.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.math3.special.Gamma;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Provides a Student-t distribution as a prior for regularization.
+ */
+public class TPrior implements PriorFunction {
+  // Degrees of freedom of the t-distribution; mutable because readFields() restores it.
+  private double df;
+
+  public TPrior(double df) {
+    this.df = df;
+  }
+
+  /**
+   * Applies {@code generations} gradient steps of the t-prior's log-density gradient,
+   * d/dbeta log p(beta) = -(df + 1) * beta / (df + beta^2), to decay a coefficient.
+   */
+  @Override
+  public double age(double oldValue, double generations, double learningRate) {
+    for (int i = 0; i < generations; i++) {
+      oldValue -= learningRate * oldValue * (df + 1.0) / (df + oldValue * oldValue);
+    }
+    return oldValue;
+  }
+
+  /**
+   * Log density of the standard Student-t distribution with {@code df} degrees of freedom.
+   * Fixed to be the true log-density (and thus consistent with the gradient used in
+   * {@link #age}): the normalizer needs a factor of 1/2 on log(df * pi), and the kernel
+   * argument is betaIJ^2 / df, not betaIJ^2.
+   */
+  @Override
+  public double logP(double betaIJ) {
+    return Gamma.logGamma((df + 1.0) / 2.0)
+        - 0.5 * Math.log(df * Math.PI)
+        - Gamma.logGamma(df / 2.0)
+        - (df + 1.0) / 2.0 * Math.log1p(betaIJ * betaIJ / df);
+  }
+
+  /** Serializes the degrees of freedom. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeDouble(df);
+  }
+
+  /** Restores the degrees of freedom written by {@link #write}. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    df = in.readDouble();
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
new file mode 100644
index 0000000..23c812f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/UniformPrior.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
<br>
+/**
+ * A uniform prior. This is an improper prior that corresponds to no regularization at all.
+ * All methods are no-ops or constants: weights neither decay nor contribute a penalty.
+ */
+public class UniformPrior implements PriorFunction {
+ @Override
+ public double age(double oldValue, double generations, double learningRate) {
+ // No decay: the uniform prior applies no shrinkage over time.
+ return oldValue;
+ }
+
+ @Override
+ public double logP(double betaIJ) {
+ // Constant (improper) density; contributes nothing to the posterior.
+ return 0;
+ }
+
+ @Override
+ public void write(DataOutput dataOutput) throws IOException {
+ // nothing to write
+ }
+
+ @Override
+ public void readFields(DataInput dataInput) throws IOException {
+ // stateless class is trivial to read
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
new file mode 100644
index 0000000..c2ad966
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/package-info.java
@@ -0,0 +1,23 @@
+/**
+ * <p>Implements a variety of on-line logistic regression classifiers using SGD-based algorithms.
+ * SGD stands for Stochastic Gradient Descent and refers to a class of learning algorithms
+ * that make it relatively easy to build high speed on-line learning algorithms for a variety
+ * of problems, notably including supervised learning for classification.</p>
+ *
+ * <p>The primary class of interest in this package is
+ * {@link org.apache.mahout.classifier.sgd.CrossFoldLearner} which contains a
+ * number (typically 5) of sub-learners, each of which is given a different portion of the
+ * training data. Each of these sub-learners can then be evaluated on the data it was not
+ * trained on. This allows fully incremental learning while still getting cross-validated
+ * performance estimates.</p>
+ *
+ * <p>The CrossFoldLearner implements {@link org.apache.mahout.classifier.OnlineLearner}
+ * and thus expects to be fed input in the form
+ * of a target variable and a feature vector. The target variable is simply an integer in the
+ * half-open interval [0..numFeatures) where numFeatures is defined when the CrossFoldLearner
+ * is constructed. The creation of feature vectors is facilitated by the classes that inherit
+ * from {@link org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder}.
+ * These classes currently implement a form of feature hashing with
+ * multiple probes to limit feature ambiguity.</p>
+ */
+package org.apache.mahout.classifier.sgd;

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
new file mode 100644
index 0000000..be7ed2a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/AbstractCluster.java
@@ -0,0 +1,390 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.parameters.Parameter;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.function.SquareRootFunction;
+import org.codehaus.jackson.map.ObjectMapper;
+
+// Base class for clustering models. Maintains the running sufficient statistics
+// s0 (weight/count), s1 (weighted sum of points) and s2 (weighted sum of squared
+// points), from which the center (mean) and radius (component-wise std) are derived
+// in computeParameters().
+//
+// NOTE(review): this class references `log` in asFormatString/asJson/formatVector
+// below, but no Logger field is declared and no logging framework is imported in
+// this file — it will not compile as shown. A declaration such as
+// `private static final Logger log = LoggerFactory.getLogger(AbstractCluster.class);`
+// (slf4j) appears to have been dropped from the diff; confirm against the repo.
+public abstract class AbstractCluster implements Cluster {
+
+ // cluster persistent state
+ private int id;
+
+ private long numObservations;
+
+ private long totalObservations;
+
+ private Vector center;
+
+ private Vector radius;
+
+ // the observation statistics
+ private double s0;
+
+ private Vector s1;
+
+ private Vector s2;
+
+ // Shared JSON mapper for the format methods; presumably safe to share across
+ // threads for writeValueAsString — TODO confirm for this Jackson version.
+ private static final ObjectMapper jxn = new ObjectMapper();
+
+ protected AbstractCluster() {}
+
+ protected AbstractCluster(Vector point, int id2) {
+ this.numObservations = (long) 0;
+ this.totalObservations = (long) 0;
+ this.center = point.clone();
+ this.radius = center.like();
+ this.s0 = (double) 0;
+ this.s1 = center.like();
+ this.s2 = center.like();
+ this.id = id2;
+ }
+
+ protected AbstractCluster(Vector center2, Vector radius2, int id2) {
+ this.numObservations = (long) 0;
+ this.totalObservations = (long) 0;
+ // Defensive copies of the supplied center/radius vectors.
+ this.center = new RandomAccessSparseVector(center2);
+ this.radius = new RandomAccessSparseVector(radius2);
+ this.s0 = (double) 0;
+ this.s1 = center.like();
+ this.s2 = center.like();
+ this.id = id2;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ // Serialization order must match readFields() exactly:
+ // id, numObservations, totalObservations, center, radius, s0, s1, s2.
+ out.writeInt(id);
+ out.writeLong(getNumObservations());
+ out.writeLong(getTotalObservations());
+ VectorWritable.writeVector(out, getCenter());
+ VectorWritable.writeVector(out, getRadius());
+ out.writeDouble(s0);
+ VectorWritable.writeVector(out, s1);
+ VectorWritable.writeVector(out, s2);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ this.id = in.readInt();
+ this.setNumObservations(in.readLong());
+ this.setTotalObservations(in.readLong());
+ this.setCenter(VectorWritable.readVector(in));
+ this.setRadius(VectorWritable.readVector(in));
+ this.setS0(in.readDouble());
+ this.setS1(VectorWritable.readVector(in));
+ this.setS2(VectorWritable.readVector(in));
+ }
+
+ @Override
+ public void configure(Configuration job) {
+ // nothing to do
+ }
+
+ @Override
+ public Collection<Parameter<?>> getParameters() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ public void createParameters(String prefix, Configuration jobConf) {
+ // nothing to do
+ }
+
+ @Override
+ public int getId() {
+ return id;
+ }
+
+ /**
+ * @param id
+ * the id to set
+ */
+ protected void setId(int id) {
+ this.id = id;
+ }
+
+ @Override
+ public long getNumObservations() {
+ return numObservations;
+ }
+
+ /**
+ * @param l
+ * the numPoints to set
+ */
+ protected void setNumObservations(long l) {
+ this.numObservations = l;
+ }
+
+ @Override
+ public long getTotalObservations() {
+ return totalObservations;
+ }
+
+ protected void setTotalObservations(long totalPoints) {
+ this.totalObservations = totalPoints;
+ }
+
+ @Override
+ public Vector getCenter() {
+ return center;
+ }
+
+ /**
+ * @param center
+ * the center to set
+ */
+ protected void setCenter(Vector center) {
+ this.center = center;
+ }
+
+ @Override
+ public Vector getRadius() {
+ return radius;
+ }
+
+ /**
+ * @param radius
+ * the radius to set
+ */
+ protected void setRadius(Vector radius) {
+ this.radius = radius;
+ }
+
+ /**
+ * @return the s0
+ */
+ protected double getS0() {
+ return s0;
+ }
+
+ protected void setS0(double s0) {
+ this.s0 = s0;
+ }
+
+ /**
+ * @return the s1
+ */
+ protected Vector getS1() {
+ return s1;
+ }
+
+ protected void setS1(Vector s1) {
+ this.s1 = s1;
+ }
+
+ /**
+ * @return the s2
+ */
+ protected Vector getS2() {
+ return s2;
+ }
+
+ protected void setS2(Vector s2) {
+ this.s2 = s2;
+ }
+
+ @Override
+ public void observe(Model<VectorWritable> x) {
+ // Merge another cluster's sufficient statistics into this one.
+ // NOTE(review): unchecked cast — throws ClassCastException if x is a Model
+ // implementation that is not an AbstractCluster.
+ AbstractCluster cl = (AbstractCluster) x;
+ setS0(getS0() + cl.getS0());
+ setS1(getS1().plus(cl.getS1()));
+ setS2(getS2().plus(cl.getS2()));
+ }
+
+ @Override
+ public void observe(VectorWritable x) {
+ observe(x.get());
+ }
+
+ @Override
+ public void observe(VectorWritable x, double weight) {
+ observe(x.get(), weight);
+ }
+
+ public void observe(Vector x, double weight) {
+ if (weight == 1.0) {
+ observe(x);
+ } else {
+ // Weighted update: s0 += w, s1 += w*x, s2 += w*x.^2 (element-wise square).
+ setS0(getS0() + weight);
+ Vector weightedX = x.times(weight);
+ if (getS1() == null) {
+ setS1(weightedX);
+ } else {
+ getS1().assign(weightedX, Functions.PLUS);
+ }
+ Vector x2 = x.times(x).times(weight);
+ if (getS2() == null) {
+ setS2(x2);
+ } else {
+ getS2().assign(x2, Functions.PLUS);
+ }
+ }
+ }
+
+ public void observe(Vector x) {
+ // Unweighted update: s0 += 1, s1 += x, s2 += x.^2.
+ setS0(getS0() + 1);
+ if (getS1() == null) {
+ setS1(x.clone());
+ } else {
+ getS1().assign(x, Functions.PLUS);
+ }
+ Vector x2 = x.times(x);
+ if (getS2() == null) {
+ setS2(x2);
+ } else {
+ getS2().assign(x2, Functions.PLUS);
+ }
+ }
+
+
+ @Override
+ public void computeParameters() {
+ // Fold the accumulated statistics into center/radius, then reset them
+ // so the next iteration starts from empty statistics.
+ if (getS0() == 0) {
+ return;
+ }
+ setNumObservations((long) getS0());
+ setTotalObservations(getTotalObservations() + getNumObservations());
+ // center = mean = s1 / s0
+ setCenter(getS1().divide(getS0()));
+ // compute the component stds
+ // radius = sqrt(s0*s2 - s1.^2) / s0, the component-wise standard deviation.
+ if (getS0() > 1) {
+ setRadius(getS2().times(getS0()).minus(getS1().times(getS1())).assign(new SquareRootFunction()).divide(getS0()));
+ }
+ setS0(0);
+ setS1(center.like());
+ setS2(center.like());
+ }
+
+ @Override
+ public String asFormatString(String[] bindings) {
+ // Returns the JSON representation as a String; empty string on serialization failure.
+ String fmtString = "";
+ try {
+ fmtString = jxn.writeValueAsString(asJson(bindings));
+ } catch (IOException e) {
+ log.error("Error writing JSON as String.", e);
+ }
+ return fmtString;
+ }
+
+ public Map<String,Object> asJson(String[] bindings) {
+ // Keys: "identifier" (subclass-defined), "n" (observation count),
+ // "c" (center vector), "r" (radius vector).
+ Map<String,Object> dict = new HashMap<>();
+ dict.put("identifier", getIdentifier());
+ dict.put("n", getNumObservations());
+ if (getCenter() != null) {
+ try {
+ dict.put("c", formatVectorAsJson(getCenter(), bindings));
+ } catch (IOException e) {
+ log.error("IOException: ", e);
+ }
+ }
+ if (getRadius() != null) {
+ try {
+ dict.put("r", formatVectorAsJson(getRadius(), bindings));
+ } catch (IOException e) {
+ log.error("IOException: ", e);
+ }
+ }
+ return dict;
+ }
+
+ public abstract String getIdentifier();
+
+ /**
+ * Compute the centroid by averaging the pointTotals
+ *
+ * @return the new centroid
+ */
+ public Vector computeCentroid() {
+ // Falls back to the current center when no points have been observed.
+ return getS0() == 0 ? getCenter() : getS1().divide(getS0());
+ }
+
+ /**
+ * Return a human-readable formatted string representation of the vector, not
+ * intended to be complete nor usable as an input/output representation
+ */
+ public static String formatVector(Vector v, String[] bindings) {
+ String fmtString = "";
+ try {
+ fmtString = jxn.writeValueAsString(formatVectorAsJson(v, bindings));
+ } catch (IOException e) {
+ log.error("Error writing JSON as String.", e);
+ }
+ return fmtString;
+ }
+
+ /**
+ * Create a List of HashMaps containing vector terms and weights
+ *
+ * @return List<Object>
+ */
+ public static List<Object> formatVectorAsJson(Vector v, String[] bindings) throws IOException {
+
+ boolean hasBindings = bindings != null;
+ boolean isSparse = v.getNumNonZeroElements() != v.size();
+
+ // we assume sequential access in the output
+ Vector provider = v.isSequentialAccess() ? v : new SequentialAccessSparseVector(v);
+
+ List<Object> terms = new LinkedList<>();
+ String term = "";
+
+ // Only non-zero elements are emitted; dense (non-sparse, unbound) vectors
+ // emit bare rounded weights, otherwise {label-or-index: weight} maps.
+ for (Element elem : provider.nonZeroes()) {
+
+ // Prefer the caller-supplied label for this index; fall back to the index itself.
+ if (hasBindings && bindings.length >= elem.index() + 1 && bindings[elem.index()] != null) {
+ term = bindings[elem.index()];
+ } else if (hasBindings || isSparse) {
+ term = String.valueOf(elem.index());
+ }
+
+ Map<String, Object> term_entry = new HashMap<>();
+ // Round the weight to 3 decimal places for readability.
+ double roundedWeight = (double) Math.round(elem.get() * 1000) / 1000;
+ if (hasBindings || isSparse) {
+ term_entry.put(term, roundedWeight);
+ terms.add(term_entry);
+ } else {
+ terms.add(roundedWeight);
+ }
+ }
+
+ return terms;
+ }
+
+ @Override
+ public boolean isConverged() {
+ // Convergence has no meaning yet, perhaps in subclasses
+ return false;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
new file mode 100644
index 0000000..07d6927
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Cluster.java
@@ -0,0 +1,90 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.common.parameters.Parametered;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.Map;
+
+/**
+ * Implementations of this interface have a printable representation and certain
+ * attributes that are common across all clustering implementations
+ *
+ */
+public interface Cluster extends Model<VectorWritable>, Parametered {
+
+ // default directory for initial clusters to prime iterative clustering
+ // algorithms
+ String INITIAL_CLUSTERS_DIR = "clusters-0";
+
+ // default directory for output of clusters per iteration
+ String CLUSTERS_DIR = "clusters-";
+
+ // default suffix for output of clusters for final iteration
+ String FINAL_ITERATION_SUFFIX = "-final";
+
+ /**
+ * Get the id of the Cluster
+ *
+ * @return a unique integer
+ */
+ int getId();
+
+ /**
+ * Get the "center" of the Cluster as a Vector
+ *
+ * @return a Vector
+ */
+ Vector getCenter();
+
+ /**
+ * Get the "radius" of the Cluster as a Vector. Usually the radius is the
+ * standard deviation expressed as a Vector of size equal to the center. Some
+ * clusters may return zero values if not appropriate.
+ *
+ * @return a Vector
+ */
+ Vector getRadius();
+
+ /**
+ * Produce a custom, human-friendly, printable representation of the Cluster.
+ *
+ * @param bindings
+ * an optional String[] containing labels used to format the primary
+ * Vector/s of this implementation.
+ * @return a String
+ */
+ String asFormatString(String[] bindings);
+
+ /**
+ * Produce a JSON representation of the Cluster.
+ *
+ * @param bindings
+ * an optional String[] containing labels used to format the primary
+ * Vector/s of this implementation.
+ * @return a Map
+ */
+ Map<String,Object> asJson(String[] bindings);
+
+ /**
+ * @return if the receiver has converged, or false if that has no meaning for
+ * the implementation
+ */
+ boolean isConverged();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
new file mode 100644
index 0000000..ad0f8ec
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ClusteringUtils.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.WeightedVector;
+import org.apache.mahout.math.neighborhood.BruteSearch;
+import org.apache.mahout.math.neighborhood.ProjectionSearch;
+import org.apache.mahout.math.neighborhood.Searcher;
+import org.apache.mahout.math.neighborhood.UpdatableSearcher;
+import org.apache.mahout.math.random.WeightedThing;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+// Static utility methods for evaluating clusterings: per-cluster distance
+// summaries, total cost, distance-cutoff estimation, Davies-Bouldin and Dunn
+// indexes, confusion matrices and the Adjusted Rand Index.
+public final class ClusteringUtils {
+ // Utility class; not instantiable.
+ private ClusteringUtils() {
+ }
+
+ /**
+ * Computes the summaries for the distances in each cluster.
+ * @param datapoints iterable of datapoints.
+ * @param centroids iterable of Centroids.
+ * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to the cluster whose
+ * index is i.
+ */
+ public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
+ Iterable<? extends Vector> centroids,
+ DistanceMeasure distanceMeasure) {
+ UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
+ searcher.addAll(centroids);
+ List<OnlineSummarizer> summarizers = new ArrayList<>();
+ if (searcher.size() == 0) {
+ return summarizers;
+ }
+ for (int i = 0; i < searcher.size(); ++i) {
+ summarizers.add(new OnlineSummarizer());
+ }
+ for (Vector v : datapoints) {
+ // NOTE(review): assumes the centroids are Centroid instances whose getIndex()
+ // values lie in [0, searcher.size()) — otherwise this casts or indexes out of range.
+ Centroid closest = (Centroid)searcher.search(v, 1).get(0).getValue();
+ OnlineSummarizer summarizer = summarizers.get(closest.getIndex());
+ summarizer.add(distanceMeasure.distance(v, closest));
+ }
+ return summarizers;
+ }
+
+ /**
+ * Adds up the distances from each point to its closest cluster and returns the sum.
+ * @param datapoints iterable of datapoints.
+ * @param centroids iterable of Centroids.
+ * @return the total cost described above.
+ */
+ public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
+ // Euclidean distance is hard-coded for this overload.
+ DistanceMeasure distanceMeasure = new EuclideanDistanceMeasure();
+ UpdatableSearcher searcher = new ProjectionSearch(distanceMeasure, 3, 1);
+ searcher.addAll(centroids);
+ return totalClusterCost(datapoints, searcher);
+ }
+
+ /**
+ * Adds up the distances from each point to its closest cluster and returns the sum.
+ * @param datapoints iterable of datapoints.
+ * @param centroids searcher of Centroids.
+ * @return the total cost described above.
+ */
+ public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids) {
+ double totalCost = 0;
+ for (Vector vector : datapoints) {
+ // searchFirst(..., false): the point itself may be matched (not excluded).
+ totalCost += centroids.searchFirst(vector, false).getWeight();
+ }
+ return totalCost;
+ }
+
+ /**
+ * Estimates the distance cutoff. In StreamingKMeans, the distance between two vectors divided
+ * by this value is used as a probability threshold when deciding whether to form a new cluster
+ * or not.
+ * Small values (comparable to the minimum distance between two points) are preferred as they
+ * guarantee with high likelihood that all but very close points are put in separate clusters
+ * initially. The clusters themselves are actually collapsed periodically when their number goes
+ * over the maximum number of clusters and the distanceCutoff is increased.
+ * So, the returned value is only an initial estimate.
+ * @param data the datapoints whose distance is to be estimated.
+ * @param distanceMeasure the distance measure used to compute the distance between two points.
+ * @return the minimum distance between the first sampleLimit points
+ * @see org.apache.mahout.clustering.streaming.cluster.StreamingKMeans#clusterInternal(Iterable, boolean)
+ */
+ public static double estimateDistanceCutoff(List<? extends Vector> data, DistanceMeasure distanceMeasure) {
+ BruteSearch searcher = new BruteSearch(distanceMeasure);
+ searcher.addAll(data);
+ double minDistance = Double.POSITIVE_INFINITY;
+ for (Vector vector : data) {
+ // searchFirst(..., true) excludes the query vector itself from the result.
+ double closest = searcher.searchFirst(vector, true).getWeight();
+ // The `minDistance > 0` guard freezes the estimate once a zero distance
+ // (duplicate point) has been observed.
+ if (minDistance > 0 && closest < minDistance) {
+ minDistance = closest;
+ }
+ // NOTE(review): every vector is already in the searcher via addAll() above,
+ // so this add() inserts duplicates — it looks redundant; confirm intent.
+ searcher.add(vector);
+ }
+ return minDistance;
+ }
+
+ // Convenience overload: estimate from at most sampleLimit points of the iterable.
+ public static <T extends Vector> double estimateDistanceCutoff(
+ Iterable<T> data, DistanceMeasure distanceMeasure, int sampleLimit) {
+ return estimateDistanceCutoff(Lists.newArrayList(Iterables.limit(data, sampleLimit)), distanceMeasure);
+ }
+
+ /**
+ * Computes the Davies-Bouldin Index for a given clustering.
+ * See http://en.wikipedia.org/wiki/Clustering_algorithm#Internal_evaluation
+ * @param centroids list of centroids
+ * @param distanceMeasure distance measure for inter-cluster distances
+ * @param clusterDistanceSummaries summaries of the clusters; See summarizeClusterDistances
+ * @return the Davies-Bouldin Index
+ */
+ public static double daviesBouldinIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
+ List<OnlineSummarizer> clusterDistanceSummaries) {
+ Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
+ "Number of centroids and cluster summaries differ.");
+ int n = centroids.size();
+ double totalDBIndex = 0;
+ // The inner loop shouldn't be reduced for j = i + 1 to n because the computation of the Davies-Bouldin
+ // index is not really symmetric.
+ // For a given cluster i, we look for a cluster j that maximizes the ratio of the sum of average distances
+ // from points in cluster i to its center and points in cluster j to its center to the distance between
+ // cluster i and cluster j.
+ // The maximization is the key issue, as the cluster that maximizes this ratio might be j for i but is NOT
+ // NECESSARILY i for j.
+ for (int i = 0; i < n; ++i) {
+ double averageDistanceI = clusterDistanceSummaries.get(i).getMean();
+ double maxDBIndex = 0;
+ for (int j = 0; j < n; ++j) {
+ if (i != j) {
+ double dbIndex = (averageDistanceI + clusterDistanceSummaries.get(j).getMean())
+ / distanceMeasure.distance(centroids.get(i), centroids.get(j));
+ if (dbIndex > maxDBIndex) {
+ maxDBIndex = dbIndex;
+ }
+ }
+ }
+ totalDBIndex += maxDBIndex;
+ }
+ return totalDBIndex / n;
+ }
+
+ /**
+ * Computes the Dunn Index of a given clustering. See http://en.wikipedia.org/wiki/Dunn_index
+ * @param centroids list of centroids
+ * @param distanceMeasure distance measure to compute inter-centroid distance with
+ * @param clusterDistanceSummaries summaries of the clusters; See summarizeClusterDistances
+ * @return the Dunn Index
+ */
+ public static double dunnIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
+ List<OnlineSummarizer> clusterDistanceSummaries) {
+ Preconditions.checkArgument(centroids.size() == clusterDistanceSummaries.size(),
+ "Number of centroids and cluster summaries differ.");
+ int n = centroids.size();
+ // Intra-cluster distances will come from the OnlineSummarizer, and will be the median distance (noting that
+ // the median for just one value is that value).
+ // A variety of metrics can be used for the intra-cluster distance including max distance between two points,
+ // mean distance, etc. Median distance was chosen as this is more robust to outliers and characterizes the
+ // distribution of distances (from a point to the center) better.
+ double maxIntraClusterDistance = 0;
+ for (OnlineSummarizer summarizer : clusterDistanceSummaries) {
+ if (summarizer.getCount() > 0) {
+ double intraClusterDistance;
+ if (summarizer.getCount() == 1) {
+ intraClusterDistance = summarizer.getMean();
+ } else {
+ intraClusterDistance = summarizer.getMedian();
+ }
+ if (maxIntraClusterDistance < intraClusterDistance) {
+ maxIntraClusterDistance = intraClusterDistance;
+ }
+ }
+ }
+ double minDunnIndex = Double.POSITIVE_INFINITY;
+ for (int i = 0; i < n; ++i) {
+ // Distances are symmetric, so d(i, j) = d(j, i).
+ for (int j = i + 1; j < n; ++j) {
+ double dunnIndex = distanceMeasure.distance(centroids.get(i), centroids.get(j));
+ if (minDunnIndex > dunnIndex) {
+ minDunnIndex = dunnIndex;
+ }
+ }
+ }
+ // NOTE(review): if every cluster has zero intra-cluster distance (e.g. all
+ // summarizers empty), this divides by zero and returns Infinity/NaN — confirm
+ // whether callers guard against that.
+ return minDunnIndex / maxIntraClusterDistance;
+ }
+
+ // n choose 2 = n * (n - 1) / 2, computed in doubles for use in the Rand index.
+ public static double choose2(double n) {
+ return n * (n - 1) / 2;
+ }
+
+ /**
+ * Creates a confusion matrix by searching for the closest cluster of both the row clustering and column clustering
+ * of a point and adding its weight to that cell of the matrix.
+ * It doesn't matter which clustering is the row clustering and which is the column clustering. If they're
+ * interchanged, the resulting matrix is the transpose of the original one.
+ * @param rowCentroids clustering one
+ * @param columnCentroids clustering two
+ * @param datapoints datapoints whose closest cluster we need to find
+ * @param distanceMeasure distance measure to use
+ * @return the confusion matrix
+ */
+ public static Matrix getConfusionMatrix(List<? extends Vector> rowCentroids, List<? extends Vector> columnCentroids,
+ Iterable<? extends Vector> datapoints, DistanceMeasure distanceMeasure) {
+ Searcher rowSearcher = new BruteSearch(distanceMeasure);
+ rowSearcher.addAll(rowCentroids);
+ Searcher columnSearcher = new BruteSearch(distanceMeasure);
+ columnSearcher.addAll(columnCentroids);
+
+ int numRows = rowCentroids.size();
+ int numCols = columnCentroids.size();
+ Matrix confusionMatrix = new DenseMatrix(numRows, numCols);
+
+ for (Vector vector : datapoints) {
+ WeightedThing<Vector> closestRowCentroid = rowSearcher.search(vector, 1).get(0);
+ WeightedThing<Vector> closestColumnCentroid = columnSearcher.search(vector, 1).get(0);
+ // NOTE(review): assumes both centroid lists contain Centroid instances whose
+ // getIndex() values address the matrix — ClassCastException otherwise.
+ int row = ((Centroid) closestRowCentroid.getValue()).getIndex();
+ int column = ((Centroid) closestColumnCentroid.getValue()).getIndex();
+ double vectorWeight;
+ if (vector instanceof WeightedVector) {
+ vectorWeight = ((WeightedVector) vector).getWeight();
+ } else {
+ vectorWeight = 1;
+ }
+ confusionMatrix.set(row, column, confusionMatrix.get(row, column) + vectorWeight);
+ }
+
+ return confusionMatrix;
+ }
+
+ /**
+ * Computes the Adjusted Rand Index for a given confusion matrix.
+ * @param confusionMatrix confusion matrix; not to be confused with the more restrictive ConfusionMatrix class
+ * @return the Adjusted Rand Index
+ */
+ public static double getAdjustedRandIndex(Matrix confusionMatrix) {
+ int numRows = confusionMatrix.numRows();
+ int numCols = confusionMatrix.numCols();
+ double rowChoiceSum = 0;
+ double columnChoiceSum = 0;
+ double totalChoiceSum = 0;
+ double total = 0;
+ // Accumulate sum_ij C(n_ij, 2), the row sums' C(a_i, 2) and the grand total.
+ for (int i = 0; i < numRows; ++i) {
+ double rowSum = 0;
+ for (int j = 0; j < numCols; ++j) {
+ rowSum += confusionMatrix.get(i, j);
+ totalChoiceSum += choose2(confusionMatrix.get(i, j));
+ }
+ total += rowSum;
+ rowChoiceSum += choose2(rowSum);
+ }
+ // Column sums' C(b_j, 2).
+ for (int j = 0; j < numCols; ++j) {
+ double columnSum = 0;
+ for (int i = 0; i < numRows; ++i) {
+ columnSum += confusionMatrix.get(i, j);
+ }
+ columnChoiceSum += choose2(columnSum);
+ }
+ // ARI = (Index - ExpectedIndex) / (MaxIndex - ExpectedIndex).
+ double rowColumnChoiceSumDivTotal = rowChoiceSum * columnChoiceSum / choose2(total);
+ return (totalChoiceSum - rowColumnChoiceSumDivTotal)
+ / ((rowChoiceSum + columnChoiceSum) / 2 - rowColumnChoiceSumDivTotal);
+ }
+
+ /**
+ * Computes the total weight of the points in the given Vector iterable.
+ * @param data iterable of points
+ * @return total weight
+ */
+ public static double totalWeight(Iterable<? extends Vector> data) {
+ double sum = 0;
+ for (Vector row : data) {
+ Preconditions.checkNotNull(row);
+ // Unweighted vectors count as weight 1.
+ if (row instanceof WeightedVector) {
+ sum += ((WeightedVector)row).getWeight();
+ } else {
+ sum++;
+ }
+ }
+ return sum;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
new file mode 100644
index 0000000..c25e039
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/GaussianAccumulator.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+
+// Accumulates weighted observations and exposes the Gaussian statistics
+// (mean, variance, standard deviation) computed from them. Implementations
+// are expected to require a compute() call before the getters are meaningful.
+public interface GaussianAccumulator {
+
+ /**
+ * @return the number of observations
+ */
+ double getN();
+
+ /**
+ * @return the mean of the observations
+ */
+ Vector getMean();
+
+ /**
+ * @return the std (standard deviation) of the observations
+ */
+ Vector getStd();
+
+ /**
+ * @return the average of the vector std elements
+ */
+ double getAverageStd();
+
+ /**
+ * @return the variance of the observations
+ */
+ Vector getVariance();
+
+ /**
+ * Observe the vector
+ *
+ * @param x a Vector
+ * @param weight the double observation weight (usually 1.0)
+ */
+ void observe(Vector x, double weight);
+
+ /**
+ * Compute the mean, variance and standard deviation
+ */
+ void compute();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
new file mode 100644
index 0000000..79dab30
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/Model.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * A model is a probability distribution over observed data points and allows
+ * the probability of any data point to be computed. All models have a
+ * persistent representation (they extend {@link Writable}) and can produce a
+ * sample of their posterior distribution via {@link #sampleFromPosterior()}.
+ *
+ * @param <O> the type of observation the model describes
+ */
+public interface Model<O> extends Writable {
+
+ /**
+ * Return the probability that the observation is described by this model
+ *
+ * @param x
+ * an Observation from the posterior
+ * @return the probability that x is in the receiver
+ */
+ double pdf(O x);
+
+ /**
+ * Observe the given observation, retaining information about it
+ *
+ * @param x
+ * an Observation from the posterior
+ */
+ void observe(O x);
+
+ /**
+ * Observe the given observation, retaining information about it
+ *
+ * @param x
+ * an Observation from the posterior
+ * @param weight
+ * a double weighting factor
+ */
+ void observe(O x, double weight);
+
+ /**
+ * Observe the given model, retaining information about its observations
+ *
+ * @param x
+ * a {@code Model<O>}
+ */
+ void observe(Model<O> x);
+
+ /**
+ * Compute a new set of posterior parameters based upon the Observations that
+ * have been observed since my creation
+ */
+ void computeParameters();
+
+ /**
+ * Return the number of observations that this model has seen since its
+ * parameters were last computed
+ *
+ * @return a long
+ */
+ long getNumObservations();
+
+ /**
+ * Return the number of observations that this model has seen over its
+ * lifetime
+ *
+ * @return a long
+ */
+ long getTotalObservations();
+
+ /**
+ * @return a sample of my posterior model
+ */
+ Model<VectorWritable> sampleFromPosterior();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
new file mode 100644
index 0000000..d77bf40
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/ModelDistribution.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+/** A model distribution allows us to sample a model from its prior distribution. */
+public interface ModelDistribution<O> {
+
+ /**
+ * Return a list of models sampled from the prior
+ *
+ * @param howMany
+ * the int number of models to return
+ * @return a {@code Model<Observation>[]} representing what is known a priori
+ */
+ Model<O>[] sampleFromPrior(int howMany);
+
+ /**
+ * Return a list of models sampled from the posterior
+ *
+ * @param posterior
+ * the {@code Model<Observation>[]} after observations
+ * @return a {@code Model<Observation>[]} representing what is known a posteriori
+ */
+ Model<O>[] sampleFromPosterior(Model<O>[] posterior);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
new file mode 100644
index 0000000..b76e00f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian statistics accumulator based upon Knuth (who cites Welford) which is declared to be
+ * numerically-stable. See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ */
+public class OnlineGaussianAccumulator implements GaussianAccumulator {
+
+ // running sum of observation weights; plays the role of "n"
+ private double sumWeight;
+ // running weighted mean; null until the first observe()
+ private Vector mean;
+ // running sum of weighted squared deviations ("S" in the pseudo-code below); null until the first observe()
+ private Vector s;
+ // s / (sumWeight - 1), refreshed on every observe(); null until the first observe()
+ private Vector variance;
+
+ @Override
+ public double getN() {
+ return sumWeight;
+ }
+
+ // NOTE(review): returns null until the first observe() call
+ @Override
+ public Vector getMean() {
+ return mean;
+ }
+
+ // NOTE(review): throws NullPointerException if called before the first observe()
+ @Override
+ public Vector getStd() {
+ return variance.clone().assign(new SquareRootFunction());
+ }
+
+ /* from Wikipedia: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ *
+ * Weighted incremental algorithm
+ *
+ * def weighted_incremental_variance(dataWeightPairs):
+ * mean = 0
+ * S = 0
+ * sumweight = 0
+ * for x, weight in dataWeightPairs: # Alternately "for x in zip(data, weight):"
+ * temp = weight + sumweight
+ * Q = x - mean
+ * R = Q * weight / temp
+ * S = S + sumweight * Q * R
+ * mean = mean + R
+ * sumweight = temp
+ * Variance = S / (sumweight-1) # if sample is the population, omit -1
+ * return Variance
+ */
+ @Override
+ public void observe(Vector x, double weight) {
+ double temp = weight + sumWeight;
+ Vector q;
+ if (mean == null) {
+ // first observation: mean starts as the zero vector, so Q = x - 0 = x
+ mean = x.like();
+ q = x.clone();
+ } else {
+ q = x.minus(mean);
+ }
+ Vector r = q.times(weight).divide(temp);
+ if (s == null) {
+ // sumWeight is still 0 here, so the initial S is the zero vector
+ s = q.times(sumWeight).times(r);
+ } else {
+ s = s.plus(q.times(sumWeight).times(r));
+ }
+ mean = mean.plus(r);
+ sumWeight = temp;
+ // NOTE(review): when sumWeight == 1 (e.g. first observation with weight 1.0)
+ // this divides by zero, producing infinite variance entries until a second
+ // observation arrives -- confirm this is the intended sample-variance behavior
+ variance = s.divide(sumWeight - 1); // # if sample is the population, omit -1
+ }
+
+ @Override
+ public void compute() {
+ // nothing to do here!
+ }
+
+ @Override
+ public double getAverageStd() {
+ if (sumWeight == 0.0) {
+ return 0.0;
+ } else {
+ Vector std = getStd();
+ return std.zSum() / std.size();
+ }
+ }
+
+ // NOTE(review): returns null until the first observe() call
+ @Override
+ public Vector getVariance() {
+ return variance;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
new file mode 100644
index 0000000..138e830
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/RunningSumsGaussianAccumulator.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.clustering;
+
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.function.SquareRootFunction;
+
+/**
+ * An online Gaussian accumulator that uses a running power sums approach as reported
+ * on http://en.wikipedia.org/wiki/Standard_deviation
+ * Suffers from overflow, underflow and roundoff error but has minimal observe-time overhead
+ */
+public class RunningSumsGaussianAccumulator implements GaussianAccumulator {
+
+ // power sums: s0 = sum of weights, s1 = weighted sum of observations,
+ // s2 = weighted sum of element-wise squared observations
+ private double s0;
+ private Vector s1;
+ private Vector s2;
+ // derived statistics; null until compute() is first called with s0 != 0
+ private Vector mean;
+ private Vector std;
+
+ @Override
+ public double getN() {
+ return s0;
+ }
+
+ // NOTE(review): returns null until compute() has been called
+ @Override
+ public Vector getMean() {
+ return mean;
+ }
+
+ // NOTE(review): returns null until compute() has been called
+ @Override
+ public Vector getStd() {
+ return std;
+ }
+
+ // NOTE(review): throws NullPointerException if observations were seen but
+ // compute() has not yet been called (std is still null)
+ @Override
+ public double getAverageStd() {
+ if (s0 == 0.0) {
+ return 0.0;
+ } else {
+ return std.zSum() / std.size();
+ }
+ }
+
+ // variance recovered as std^2 (element-wise); requires a prior compute()
+ @Override
+ public Vector getVariance() {
+ return std.times(std);
+ }
+
+ @Override
+ public void observe(Vector x, double weight) {
+ s0 += weight;
+ Vector weightedX = x.times(weight);
+ if (s1 == null) {
+ s1 = weightedX;
+ } else {
+ s1.assign(weightedX, Functions.PLUS);
+ }
+ Vector x2 = x.times(x).times(weight);
+ if (s2 == null) {
+ s2 = x2;
+ } else {
+ s2.assign(x2, Functions.PLUS);
+ }
+ }
+
+ @Override
+ public void compute() {
+ if (s0 != 0.0) {
+ mean = s1.divide(s0);
+ // population standard deviation: sqrt(s0*s2 - s1^2) / s0
+ std = s2.times(s0).minus(s1.times(s1)).assign(new SquareRootFunction()).divide(s0);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
new file mode 100644
index 0000000..ef43e1b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/UncommonDistributions.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.RealDistribution;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
+/** Utility samplers for distributions not covered directly by commons-math. */
+public final class UncommonDistributions {
+
+ private static final RandomWrapper RANDOM = RandomUtils.getRandom();
+
+ private UncommonDistributions() {}
+
+ // =============== start of BSD licensed code. See LICENSE.txt
+ /**
+ * Returns a double sampled according to this distribution. Uniformly fast for all k > 0. (Reference:
+ * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses
+ * Cheng's rejection algorithm (GB) for k>=1, rejection from Weibull distribution for 0 < k < 1.
+ *
+ * @param k the shape parameter (> 0)
+ * @param lambda the rate parameter; samples are scaled by 1/lambda
+ * @return a double sample from Gamma(k, lambda)
+ */
+ public static double rGamma(double k, double lambda) {
+ boolean accept = false;
+ if (k >= 1.0) {
+ // Cheng's algorithm
+ double b = k - Math.log(4.0);
+ double c = k + Math.sqrt(2.0 * k - 1.0);
+ double lam = Math.sqrt(2.0 * k - 1.0);
+ double cheng = 1.0 + Math.log(4.5);
+ double x;
+ do {
+ double u = RANDOM.nextDouble();
+ double v = RANDOM.nextDouble();
+ double y = 1.0 / lam * Math.log(v / (1.0 - v));
+ x = k * Math.exp(y);
+ double z = u * v * v;
+ double r = b + c * y - x;
+ // quick acceptance test first, then the exact (log) test
+ if (r >= 4.5 * z - cheng || r >= Math.log(z)) {
+ accept = true;
+ }
+ } while (!accept);
+ return x / lambda;
+ } else {
+ // Weibull algorithm
+ double c = 1.0 / k;
+ double d = (1.0 - k) * Math.pow(k, k / (1.0 - k));
+ double x;
+ do {
+ double u = RANDOM.nextDouble();
+ double v = RANDOM.nextDouble();
+ double z = -Math.log(u);
+ double e = -Math.log(v);
+ x = Math.pow(z, c);
+ if (z + e >= d + x) {
+ accept = true;
+ }
+ } while (!accept);
+ return x / lambda;
+ }
+ }
+
+ // ============= end of BSD licensed code
+
+ /**
+ * Returns a random sample from a beta distribution with the given shapes,
+ * built from two gamma samples: Beta(a,b) = Ga / (Ga + Gb)
+ *
+ * @param shape1
+ * a double representing shape1
+ * @param shape2
+ * a double representing shape2
+ * @return a double sample in (0, 1)
+ */
+ public static double rBeta(double shape1, double shape2) {
+ double gam1 = rGamma(shape1, 1.0);
+ double gam2 = rGamma(shape2, 1.0);
+ return gam1 / (gam1 + gam2);
+
+ }
+
+ /**
+ * Return a random value from a normal distribution with the given mean and standard deviation
+ *
+ * @param mean
+ * a double mean value
+ * @param sd
+ * a double standard deviation
+ * @return a double sample
+ */
+ public static double rNorm(double mean, double sd) {
+ RealDistribution dist = new NormalDistribution(RANDOM.getRandomGenerator(),
+ mean,
+ sd,
+ NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
+ return dist.sample();
+ }
+
+ /**
+ * Returns an integer sampled according to this distribution. Takes time proportional to np + 1. (Reference:
+ * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Second
+ * time-waiting algorithm.
+ *
+ * @param n the number of Bernoulli trials
+ * @param p the per-trial success probability
+ * @return an int sample from Binomial(n, p)
+ */
+ public static int rBinomial(int n, double p) {
+ if (p >= 1.0) {
+ return n; // needed to avoid infinite loops and negative results
+ }
+ double q = -Math.log1p(-p);
+ double sum = 0.0;
+ int x = 0;
+ // accumulate exponential waiting times until the budget q is exceeded
+ while (sum <= q) {
+ double u = RANDOM.nextDouble();
+ double e = -Math.log(u);
+ sum += e / (n - x);
+ x++;
+ }
+ if (x == 0) {
+ return 0;
+ }
+ return x - 1;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
new file mode 100644
index 0000000..930fd44
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/Canopy.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import org.apache.mahout.clustering.iterator.DistanceMeasureCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+/**
+ * This class models a canopy as a center point, the number of points that are contained within it according
+ * to the application of some distance metric, and a point total which is the sum of all the points and is
+ * used to compute the centroid when needed.
+ */
+@Deprecated
+public class Canopy extends DistanceMeasureCluster {
+
+ /** Used for deserialization as a writable */
+ public Canopy() { }
+
+ /**
+ * Create a new Canopy containing the given point and canopyId
+ *
+ * @param center a point in vector space
+ * @param canopyId an int identifying the canopy local to this process only
+ * @param measure a DistanceMeasure to use
+ */
+ public Canopy(Vector center, int canopyId, DistanceMeasure measure) {
+ super(center, canopyId, measure);
+ observe(center);
+ }
+
+ /** @return a String like "C<id>: <centroid>" using the computed centroid, not the center */
+ public String asFormatString() {
+ return "C" + this.getId() + ": " + this.computeCentroid().asFormatString();
+ }
+
+ @Override
+ public String toString() {
+ return getIdentifier() + ": " + getCenter().asFormatString();
+ }
+
+ @Override
+ public String getIdentifier() {
+ return "C-" + getId();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
new file mode 100644
index 0000000..3ce4757
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyClusterer.java
@@ -0,0 +1,220 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.math.Vector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Performs canopy clustering with two distance thresholds T1 > T2: points within T1 of a
+ * canopy center are observed by that canopy, and a point within T2 of some canopy is
+ * "strongly bound" and creates no new canopy. T3/T4 are alternate thresholds swapped in
+ * by the reducer via {@link #useT3T4()}.
+ */
+@Deprecated
+public class CanopyClusterer {
+
+ private static final Logger log = LoggerFactory.getLogger(CanopyClusterer.class);
+
+ private int nextCanopyId;
+
+ // the T1 distance threshold
+ private double t1;
+
+ // the T2 distance threshold
+ private double t2;
+
+ // the T3 distance threshold
+ private double t3;
+
+ // the T4 distance threshold
+ private double t4;
+
+ // the distance measure
+ private DistanceMeasure measure;
+
+ /** T3/T4 default to T1/T2 until overridden by the setters below. */
+ public CanopyClusterer(DistanceMeasure measure, double t1, double t2) {
+ this.t1 = t1;
+ this.t2 = t2;
+ this.t3 = t1;
+ this.t4 = t2;
+ this.measure = measure;
+ }
+
+ public double getT1() {
+ return t1;
+ }
+
+ public double getT2() {
+ return t2;
+ }
+
+ public double getT3() {
+ return t3;
+ }
+
+ public double getT4() {
+ return t4;
+ }
+
+ /**
+ * Used by CanopyReducer to set t1=t3 and t2=t4 configuration values
+ */
+ public void useT3T4() {
+ t1 = t3;
+ t2 = t4;
+ }
+
+ /**
+ * This is the same algorithm as the reference but inverted to iterate over
+ * existing canopies instead of the points. Because of this it does not need
+ * to actually store the points, instead storing a total points vector and
+ * the number of points. From this a centroid can be computed.
+ * <p/>
+ * This method is used by the CanopyMapper, CanopyReducer and CanopyDriver.
+ *
+ * @param point
+ * the point to be added
+ * @param canopies
+ * the List<Canopy> to be appended
+ */
+ public void addPointToCanopies(Vector point, Collection<Canopy> canopies) {
+ boolean pointStronglyBound = false;
+ // a point within T1 of several canopies is observed by every one of them
+ for (Canopy canopy : canopies) {
+ double dist = measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point);
+ if (dist < t1) {
+ if (log.isDebugEnabled()) {
+ log.debug("Added point: {} to canopy: {}", AbstractCluster.formatVector(point, null), canopy.getIdentifier());
+ }
+ canopy.observe(point);
+ }
+ pointStronglyBound = pointStronglyBound || dist < t2;
+ }
+ if (!pointStronglyBound) {
+ if (log.isDebugEnabled()) {
+ log.debug("Created new Canopy:{} at center:{}", nextCanopyId, AbstractCluster.formatVector(point, null));
+ }
+ canopies.add(new Canopy(point, nextCanopyId++, measure));
+ }
+ }
+
+ /**
+ * Return if the point is covered by the canopy
+ *
+ * @param point
+ * a point
+ * @return if the point is covered
+ */
+ public boolean canopyCovers(Canopy canopy, Vector point) {
+ return measure.distance(canopy.getCenter().getLengthSquared(), canopy.getCenter(), point) < t1;
+ }
+
+ /**
+ * Iterate through the points, adding new canopies. Return the canopies.
+ *
+ * @param points
+ * a list<Vector> defining the points to be clustered
+ * @param measure
+ * a DistanceMeasure to use
+ * @param t1
+ * the T1 distance threshold
+ * @param t2
+ * the T2 distance threshold
+ * @return the List<Canopy> created
+ */
+ public static List<Canopy> createCanopies(List<Vector> points,
+ DistanceMeasure measure,
+ double t1,
+ double t2) {
+ List<Canopy> canopies = Lists.newArrayList();
+ /*
+ * Reference Implementation: Given a distance metric, one can create
+ * canopies as follows: Start with a list of the data points in any
+ * order, and with two distance thresholds, T1 and T2, where T1 > T2.
+ * (These thresholds can be set by the user, or selected by
+ * cross-validation.) Pick a point on the list and measure its distance
+ * to all other points. Put all points that are within distance
+ * threshold T1 into a canopy. Remove from the list all points that are
+ * within distance threshold T2. Repeat until the list is empty.
+ */
+ int nextCanopyId = 0;
+ // NOTE(review): this loop consumes the caller's points list (ptIter.remove())
+ while (!points.isEmpty()) {
+ Iterator<Vector> ptIter = points.iterator();
+ Vector p1 = ptIter.next();
+ ptIter.remove();
+ Canopy canopy = new Canopy(p1, nextCanopyId++, measure);
+ canopies.add(canopy);
+ while (ptIter.hasNext()) {
+ Vector p2 = ptIter.next();
+ double dist = measure.distance(p1, p2);
+ // Put all points that are within distance threshold T1 into the
+ // canopy
+ if (dist < t1) {
+ canopy.observe(p2);
+ }
+ // Remove from the list all points that are within distance
+ // threshold T2
+ if (dist < t2) {
+ ptIter.remove();
+ }
+ }
+ // NOTE(review): computeParameters() runs on EVERY canopy once per outer
+ // iteration, not just the newly created one -- confirm whether this was
+ // intended or should run once after the outer loop completes
+ for (Canopy c : canopies) {
+ c.computeParameters();
+ }
+ }
+ return canopies;
+ }
+
+ /**
+ * Iterate through the canopies, adding their centroids to a list
+ *
+ * @param canopies
+ * a List<Canopy>
+ * @return the List<Vector>
+ */
+ public static List<Vector> getCenters(Iterable<Canopy> canopies) {
+ List<Vector> result = Lists.newArrayList();
+ for (Canopy canopy : canopies) {
+ result.add(canopy.getCenter());
+ }
+ return result;
+ }
+
+ /**
+ * Iterate through the canopies, resetting their center to their centroids
+ *
+ * @param canopies
+ * a List<Canopy>
+ */
+ public static void updateCentroids(Iterable<Canopy> canopies) {
+ for (Canopy canopy : canopies) {
+ canopy.computeParameters();
+ }
+ }
+
+ /** Override the T3 threshold used after {@link #useT3T4()} is invoked. */
+ public void setT3(double t3) {
+ this.t3 = t3;
+ }
+
+ /** Override the T4 threshold used after {@link #useT3T4()} is invoked. */
+ public void setT4(double t4) {
+ this.t4 = t4;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
new file mode 100644
index 0000000..2f24026
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyConfigKeys.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+
+/** Configuration key constants and factory support for canopy clustering jobs. */
+@Deprecated
+public final class CanopyConfigKeys {
+
+ private CanopyConfigKeys() {}
+
+ public static final String T1_KEY = "org.apache.mahout.clustering.canopy.t1";
+
+ public static final String T2_KEY = "org.apache.mahout.clustering.canopy.t2";
+
+ public static final String T3_KEY = "org.apache.mahout.clustering.canopy.t3";
+
+ public static final String T4_KEY = "org.apache.mahout.clustering.canopy.t4";
+
+ // keys used by Driver, Mapper, Combiner & Reducer
+ public static final String DISTANCE_MEASURE_KEY = "org.apache.mahout.clustering.canopy.measure";
+
+ public static final String CF_KEY = "org.apache.mahout.clustering.canopy.canopyFilter";
+
+ /**
+ * Create a {@link CanopyClusterer} from the Hadoop configuration.
+ * T1_KEY, T2_KEY and DISTANCE_MEASURE_KEY are required (a missing T1/T2
+ * yields a NumberFormatException from parsing null); T3_KEY and T4_KEY are
+ * optional and, when absent, the clusterer's T3/T4 default to T1/T2.
+ *
+ * @param configuration Hadoop configuration
+ *
+ * @return CanopyClusterer
+ */
+ public static CanopyClusterer configureCanopyClusterer(Configuration configuration) {
+ double t1 = Double.parseDouble(configuration.get(T1_KEY));
+ double t2 = Double.parseDouble(configuration.get(T2_KEY));
+
+ DistanceMeasure measure = ClassUtils.instantiateAs(configuration.get(DISTANCE_MEASURE_KEY), DistanceMeasure.class);
+ measure.configure(configuration);
+
+ CanopyClusterer canopyClusterer = new CanopyClusterer(measure, t1, t2);
+
+ String d = configuration.get(T3_KEY);
+ if (d != null) {
+ canopyClusterer.setT3(Double.parseDouble(d));
+ }
+
+ d = configuration.get(T4_KEY);
+ if (d != null) {
+ canopyClusterer.setT4(Double.parseDouble(d));
+ }
+ return canopyClusterer;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
new file mode 100644
index 0000000..06dc947
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/clustering/canopy/CanopyDriver.java
@@ -0,0 +1,379 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.canopy;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassificationDriver;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.CanopyClusteringPolicy;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.topdown.PathDirectory;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+
+@Deprecated
+public class CanopyDriver extends AbstractJob {
+
+ public static final String DEFAULT_CLUSTERED_POINTS_DIRECTORY = "clusteredPoints";
+
+ private static final Logger log = LoggerFactory.getLogger(CanopyDriver.class);
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new CanopyDriver(), args);
+ }
+
+ /**
+ * Parse the command line options, derive the canopy parameters (T3/T4 default
+ * to T1/T2 when not given) and delegate to the static run() entry point.
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.t1Option().create());
+ addOption(DefaultOptionCreator.t2Option().create());
+ addOption(DefaultOptionCreator.t3Option().create());
+ addOption(DefaultOptionCreator.t4Option().create());
+ addOption(DefaultOptionCreator.clusterFilterOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption(DefaultOptionCreator.clusteringOption().create());
+ addOption(DefaultOptionCreator.methodOption().create());
+ addOption(DefaultOptionCreator.outlierThresholdOption().create());
+
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ Configuration conf = getConf();
+ // --overwrite: clear any previous output so the job starts clean.
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(conf, output);
+ }
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ // Reducer thresholds T3/T4 default to the mapper thresholds T1/T2.
+ double t3 = t1;
+ if (hasOption(DefaultOptionCreator.T3_OPTION)) {
+ t3 = Double.parseDouble(getOption(DefaultOptionCreator.T3_OPTION));
+ }
+ double t4 = t2;
+ if (hasOption(DefaultOptionCreator.T4_OPTION)) {
+ t4 = Double.parseDouble(getOption(DefaultOptionCreator.T4_OPTION));
+ }
+ // 0 means "keep every canopy" (see the strict > comparison in buildClustersSeq).
+ int clusterFilter = 0;
+ if (hasOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION)) {
+ clusterFilter = Integer
+ .parseInt(getOption(DefaultOptionCreator.CLUSTER_FILTER_OPTION));
+ }
+ boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
+ boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
+ .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+ double clusterClassificationThreshold = 0.0;
+ if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) {
+ clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD));
+ }
+ run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter,
+ runClustering, clusterClassificationThreshold, runSequential);
+ return 0;
+ }
+
+ /**
+ * Build a directory of Canopy clusters from the input arguments and, if
+ * requested, cluster the input vectors using these clusters
+ *
+ * @param conf
+ * the Configuration
+ * @param input
+ * the Path to the directory containing input vectors
+ * @param output
+ * the Path for all output directories
+ * @param measure
+ * the DistanceMeasure
+ * @param t1
+ * the double T1 distance metric
+ * @param t2
+ * the double T2 distance metric
+ * @param t3
+ * the reducer's double T1 distance metric
+ * @param t4
+ * the reducer's double T2 distance metric
+ * @param clusterFilter
+ * the minimum canopy size output by the mappers
+ * @param runClustering
+ * cluster the input vectors if true
+ * @param clusterClassificationThreshold
+ * vectors having pdf below this value will not be clustered. Its value should be between 0 and 1.
+ * @param runSequential
+ * execute sequentially if true
+ */
+ public static void run(Configuration conf, Path input, Path output,
+ DistanceMeasure measure, double t1, double t2, double t3, double t4,
+ int clusterFilter, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ // Phase 1: build the canopies; phase 2 (optional): classify each input
+ // vector against those canopies.
+ Path clustersOut = buildClusters(conf, input, output, measure, t1, t2, t3,
+ t4, clusterFilter, runSequential);
+ if (runClustering) {
+ clusterData(conf, input, clustersOut, output, clusterClassificationThreshold, runSequential);
+ }
+ }
+
+ /**
+ * Convenience method to provide backward compatibility
+ */
+ public static void run(Configuration conf, Path input, Path output,
+ DistanceMeasure measure, double t1, double t2, boolean runClustering,
+ double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException,
+ ClassNotFoundException {
+ // t3=t1, t4=t2, clusterFilter=0 reproduce the historical defaults.
+ run(conf, input, output, measure, t1, t2, t1, t2, 0, runClustering,
+ clusterClassificationThreshold, runSequential);
+ }
+
+ /**
+ * Convenience method creates new Configuration() Build a directory of Canopy
+ * clusters from the input arguments and, if requested, cluster the input
+ * vectors using these clusters
+ *
+ * @param input
+ * the Path to the directory containing input vectors
+ * @param output
+ * the Path for all output directories
+ * @param t1
+ * the double T1 distance metric
+ * @param t2
+ * the double T2 distance metric
+ * @param runClustering
+ * cluster the input vectors if true
+ * @param clusterClassificationThreshold
+ * vectors having pdf below this value will not be clustered. Its value should be between 0 and 1.
+ * @param runSequential
+ * execute sequentially if true
+ */
+ public static void run(Path input, Path output, DistanceMeasure measure,
+ double t1, double t2, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ run(new Configuration(), input, output, measure, t1, t2, runClustering,
+ clusterClassificationThreshold, runSequential);
+ }
+
+ /**
+ * Convenience method for backwards compatibility
+ *
+ */
+ public static Path buildClusters(Configuration conf, Path input, Path output,
+ DistanceMeasure measure, double t1, double t2, int clusterFilter,
+ boolean runSequential) throws IOException, InterruptedException,
+ ClassNotFoundException {
+ return buildClusters(conf, input, output, measure, t1, t2, t1, t2,
+ clusterFilter, runSequential);
+ }
+
+ /**
+ * Build a directory of Canopy clusters from the input vectors and other
+ * arguments. Run sequential or mapreduce execution as requested
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the Path to the directory containing input vectors
+ * @param output
+ * the Path for all output directories
+ * @param measure
+ * the DistanceMeasure
+ * @param t1
+ * the double T1 distance metric
+ * @param t2
+ * the double T2 distance metric
+ * @param t3
+ * the reducer's double T1 distance metric
+ * @param t4
+ * the reducer's double T2 distance metric
+ * @param clusterFilter
+ * the int minimum size of canopies produced
+ * @param runSequential
+ * a boolean indicates to run the sequential (reference) algorithm
+ * @return the canopy output directory Path
+ */
+ public static Path buildClusters(Configuration conf, Path input, Path output,
+ DistanceMeasure measure, double t1, double t2, double t3, double t4,
+ int clusterFilter, boolean runSequential) throws IOException,
+ InterruptedException, ClassNotFoundException {
+ log.info("Build Clusters Input: {} Out: {} Measure: {} t1: {} t2: {}",
+ input, output, measure, t1, t2);
+ if (runSequential) {
+ // NOTE: the sequential path ignores t3/t4 — there is no reducer stage.
+ return buildClustersSeq(input, output, measure, t1, t2, clusterFilter);
+ } else {
+ return buildClustersMR(conf, input, output, measure, t1, t2, t3, t4,
+ clusterFilter);
+ }
+ }
+
+ /**
+ * Build a directory of Canopy clusters from the input vectors and other
+ * arguments. Run sequential execution
+ *
+ * @param input
+ * the Path to the directory containing input vectors
+ * @param output
+ * the Path for all output directories
+ * @param measure
+ * the DistanceMeasure
+ * @param t1
+ * the double T1 distance metric
+ * @param t2
+ * the double T2 distance metric
+ * @param clusterFilter
+ * the int minimum size of canopies produced
+ * @return the canopy output directory Path
+ */
+ private static Path buildClustersSeq(Path input, Path output,
+ DistanceMeasure measure, double t1, double t2, int clusterFilter)
+ throws IOException {
+ CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2);
+ Collection<Canopy> canopies = Lists.newArrayList();
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(input.toUri(), conf);
+
+ // Single pass over every vector in the input directory, growing the
+ // in-memory canopy list as we go.
+ for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(
+ input, PathType.LIST, PathFilters.logsCRCFilter(), conf)) {
+ clusterer.addPointToCanopies(vw.get(), canopies);
+ }
+
+ // Emit a single "final iteration 0" part file, mirroring the layout the
+ // MR path produces so downstream consumers see the same structure.
+ Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+ Path path = new Path(canopyOutputDir, "part-r-00000");
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
+ Text.class, ClusterWritable.class);
+ try {
+ ClusterWritable clusterWritable = new ClusterWritable();
+ for (Canopy canopy : canopies) {
+ canopy.computeParameters();
+ if (log.isDebugEnabled()) {
+ log.debug("Writing Canopy:{} center:{} numPoints:{} radius:{}",
+ canopy.getIdentifier(),
+ AbstractCluster.formatVector(canopy.getCenter(), null),
+ canopy.getNumObservations(),
+ AbstractCluster.formatVector(canopy.getRadius(), null));
+ }
+ // Strictly-greater comparison: canopies of exactly clusterFilter
+ // observations are dropped.
+ if (canopy.getNumObservations() > clusterFilter) {
+ clusterWritable.setValue(canopy);
+ writer.append(new Text(canopy.getIdentifier()), clusterWritable);
+ }
+ }
+ } finally {
+ Closeables.close(writer, false);
+ }
+ return canopyOutputDir;
+ }
+
+ /**
+ * Build a directory of Canopy clusters from the input vectors and other
+ * arguments. Run mapreduce execution
+ *
+ * @param conf
+ * the Configuration
+ * @param input
+ * the Path to the directory containing input vectors
+ * @param output
+ * the Path for all output directories
+ * @param measure
+ * the DistanceMeasure
+ * @param t1
+ * the double T1 distance metric
+ * @param t2
+ * the double T2 distance metric
+ * @param t3
+ * the reducer's double T1 distance metric
+ * @param t4
+ * the reducer's double T2 distance metric
+ * @param clusterFilter
+ * the int minimum size of canopies produced
+ * @return the canopy output directory Path
+ */
+ private static Path buildClustersMR(Configuration conf, Path input,
+ Path output, DistanceMeasure measure, double t1, double t2, double t3,
+ double t4, int clusterFilter) throws IOException, InterruptedException,
+ ClassNotFoundException {
+ // All parameters travel to the mapper/reducer via the job configuration;
+ // the keys must match what CanopyConfigKeys reads back out.
+ conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass()
+ .getName());
+ conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
+ conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
+ conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(t3));
+ conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(t4));
+ conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter));
+
+ Job job = new Job(conf, "Canopy Driver running buildClusters over input: "
+ + input);
+ job.setInputFormatClass(SequenceFileInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+ job.setMapperClass(CanopyMapper.class);
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(VectorWritable.class);
+ job.setReducerClass(CanopyReducer.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(ClusterWritable.class);
+ // A single reducer merges every mapper's canopies into one final set.
+ job.setNumReduceTasks(1);
+ job.setJarByClass(CanopyDriver.class);
+
+ FileInputFormat.addInputPath(job, input);
+ Path canopyOutputDir = new Path(output, Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
+ FileOutputFormat.setOutputPath(job, canopyOutputDir);
+ if (!job.waitForCompletion(true)) {
+ throw new InterruptedException("Canopy Job failed processing " + input);
+ }
+ return canopyOutputDir;
+ }
+
+ /**
+ * Classify each input vector against the canopies just built, writing the
+ * results under output/clusteredPoints via ClusterClassificationDriver.
+ */
+ private static void clusterData(Configuration conf,
+ Path points,
+ Path canopies,
+ Path output,
+ double clusterClassificationThreshold,
+ boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
+ ClusterClassificationDriver.run(conf, points, output, new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY),
+ clusterClassificationThreshold, true, runSequential);
+ }
+
+}
r***@apache.org
2018-06-28 14:54:38 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
new file mode 100644
index 0000000..f56814b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CrossFoldLearner.java
@@ -0,0 +1,334 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleDoubleFunction;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.stats.GlobalOnlineAuc;
+import org.apache.mahout.math.stats.OnlineAuc;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Does cross-fold validation of log-likelihood and AUC on several online logistic regression
+ * models. Each record is passed to all but one of the models for training and to the remaining
+ * model for evaluation. In order to maintain proper segregation between the different folds across
+ * training data iterations, data should either be passed to this learner in the same order each
+ * time the training data is traversed or a tracking key such as the file offset of the training
+ * record should be passed with each training example.
+ */
+public class CrossFoldLearner extends AbstractVectorClassifier implements OnlineLearner, Writable {
+ // number of training examples seen so far; doubles as the tracking key
+ // when train(int, Vector) is called without one
+ private int record;
+ // minimum score to be used for computing log likelihood
+ private static final double MIN_SCORE = 1.0e-50;
+ private OnlineAuc auc = new GlobalOnlineAuc();
+ private double logLikelihood;
+ private final List<OnlineLogisticRegression> models = new ArrayList<>();
+
+ // lambda, learningRate, perTermOffset, perTermExponent
+ private double[] parameters = new double[4];
+ private int numFeatures;
+ private PriorFunction prior;
+ private double percentCorrect;
+
+ // window for the exponential-style moving averages of logLikelihood and
+ // percentCorrect; MAX_VALUE means "average over everything seen"
+ private int windowSize = Integer.MAX_VALUE;
+
+ // No-arg constructor required for Writable deserialization.
+ public CrossFoldLearner() {
+ }
+
+ public CrossFoldLearner(int folds, int numCategories, int numFeatures, PriorFunction prior) {
+ this.numFeatures = numFeatures;
+ this.prior = prior;
+ for (int i = 0; i < folds; i++) {
+ OnlineLogisticRegression model = new OnlineLogisticRegression(numCategories, numFeatures, prior);
+ model.alpha(1).stepOffset(0).decayExponent(0);
+ models.add(model);
+ }
+ }
+
+ // -------- builder-like configuration methods
+ // Each setter fans the value out to every fold's model and returns this
+ // so calls can be chained.
+
+ public CrossFoldLearner lambda(double v) {
+ for (OnlineLogisticRegression model : models) {
+ model.lambda(v);
+ }
+ return this;
+ }
+
+ public CrossFoldLearner learningRate(double x) {
+ for (OnlineLogisticRegression model : models) {
+ model.learningRate(x);
+ }
+ return this;
+ }
+
+ public CrossFoldLearner stepOffset(int x) {
+ for (OnlineLogisticRegression model : models) {
+ model.stepOffset(x);
+ }
+ return this;
+ }
+
+ public CrossFoldLearner decayExponent(double x) {
+ for (OnlineLogisticRegression model : models) {
+ model.decayExponent(x);
+ }
+ return this;
+ }
+
+ public CrossFoldLearner alpha(double alpha) {
+ for (OnlineLogisticRegression model : models) {
+ model.alpha(alpha);
+ }
+ return this;
+ }
+
+ // -------- training methods
+ @Override
+ public void train(int actual, Vector instance) {
+ // Without an explicit tracking key, use the running record count, which
+ // keeps fold assignment stable only if data arrives in the same order.
+ train(record, null, actual, instance);
+ }
+
+ @Override
+ public void train(long trackingKey, int actual, Vector instance) {
+ train(trackingKey, null, actual, instance);
+ }
+
+ @Override
+ public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+ record++;
+ int k = 0;
+ for (OnlineLogisticRegression model : models) {
+ // Exactly one fold (chosen by the tracking key) holds this example out
+ // for evaluation; every other fold trains on it.
+ if (k == mod(trackingKey, models.size())) {
+ Vector v = model.classifyFull(instance);
+ // Clamp the score so log() never sees zero.
+ double score = Math.max(v.get(actual), MIN_SCORE);
+ // Incremental moving average over at most windowSize records.
+ logLikelihood += (Math.log(score) - logLikelihood) / Math.min(record, windowSize);
+
+ int correct = v.maxValueIndex() == actual ? 1 : 0;
+ percentCorrect += (correct - percentCorrect) / Math.min(record, windowSize);
+ // AUC is only defined for the binary case.
+ if (numCategories() == 2) {
+ auc.addSample(actual, groupKey, v.get(1));
+ }
+ } else {
+ model.train(trackingKey, groupKey, actual, instance);
+ }
+ k++;
+ }
+ }
+
+ // Floor modulus: result is always in [0, y), even for negative keys.
+ private static long mod(long x, int y) {
+ long r = x % y;
+ return r < 0 ? r + y : r;
+ }
+
+ @Override
+ public void close() {
+ for (OnlineLogisticRegression m : models) {
+ m.close();
+ }
+ }
+
+ public void resetLineCounter() {
+ record = 0;
+ }
+
+ // True only if every fold's model is still numerically valid.
+ public boolean validModel() {
+ boolean r = true;
+ for (OnlineLogisticRegression model : models) {
+ r &= model.validModel();
+ }
+ return r;
+ }
+
+ // -------- classification methods
+ // Classification averages the per-fold results with equal weight.
+
+ @Override
+ public Vector classify(Vector instance) {
+ Vector r = new DenseVector(numCategories() - 1);
+ DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size());
+ for (OnlineLogisticRegression model : models) {
+ r.assign(model.classify(instance), scale);
+ }
+ return r;
+ }
+
+ @Override
+ public Vector classifyNoLink(Vector instance) {
+ Vector r = new DenseVector(numCategories() - 1);
+ DoubleDoubleFunction scale = Functions.plusMult(1.0 / models.size());
+ for (OnlineLogisticRegression model : models) {
+ r.assign(model.classifyNoLink(instance), scale);
+ }
+ return r;
+ }
+
+ @Override
+ public double classifyScalar(Vector instance) {
+ double r = 0;
+ int n = 0;
+ for (OnlineLogisticRegression model : models) {
+ n++;
+ r += model.classifyScalar(instance);
+ }
+ return r / n;
+ }
+
+ // -------- status reporting methods
+
+ @Override
+ public int numCategories() {
+ return models.get(0).numCategories();
+ }
+
+ public double auc() {
+ return auc.auc();
+ }
+
+ public double logLikelihood() {
+ return logLikelihood;
+ }
+
+ public double percentCorrect() {
+ return percentCorrect;
+ }
+
+ // -------- evolutionary optimization
+
+ public CrossFoldLearner copy() {
+ CrossFoldLearner r = new CrossFoldLearner(models.size(), numCategories(), numFeatures, prior);
+ r.models.clear();
+ for (OnlineLogisticRegression model : models) {
+ // NOTE(review): this closes each source model while copying — the
+ // source learner appears to be treated as finished after copy();
+ // confirm before reusing the original for further training.
+ model.close();
+ OnlineLogisticRegression newModel =
+ new OnlineLogisticRegression(model.numCategories(), model.numFeatures(), model.prior);
+ newModel.copyFrom(model);
+ r.models.add(newModel);
+ }
+ return r;
+ }
+
+ public int getRecord() {
+ return record;
+ }
+
+ public void setRecord(int record) {
+ this.record = record;
+ }
+
+ public OnlineAuc getAucEvaluator() {
+ return auc;
+ }
+
+ public void setAucEvaluator(OnlineAuc auc) {
+ this.auc = auc;
+ }
+
+ public double getLogLikelihood() {
+ return logLikelihood;
+ }
+
+ public void setLogLikelihood(double logLikelihood) {
+ this.logLikelihood = logLikelihood;
+ }
+
+ public List<OnlineLogisticRegression> getModels() {
+ return models;
+ }
+
+ public void addModel(OnlineLogisticRegression model) {
+ models.add(model);
+ }
+
+ public double[] getParameters() {
+ return parameters;
+ }
+
+ public void setParameters(double[] parameters) {
+ this.parameters = parameters;
+ }
+
+ public int getNumFeatures() {
+ return numFeatures;
+ }
+
+ public void setNumFeatures(int numFeatures) {
+ this.numFeatures = numFeatures;
+ }
+
+ public void setWindowSize(int windowSize) {
+ this.windowSize = windowSize;
+ auc.setWindowSize(windowSize);
+ }
+
+ public PriorFunction getPrior() {
+ return prior;
+ }
+
+ public void setPrior(PriorFunction prior) {
+ this.prior = prior;
+ }
+
+ // Serialization: field order here must match readFields() exactly.
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(record);
+ PolymorphicWritable.write(out, auc);
+ out.writeDouble(logLikelihood);
+ out.writeInt(models.size());
+ for (OnlineLogisticRegression model : models) {
+ model.write(out);
+ }
+
+ for (double x : parameters) {
+ out.writeDouble(x);
+ }
+ out.writeInt(numFeatures);
+ PolymorphicWritable.write(out, prior);
+ out.writeDouble(percentCorrect);
+ out.writeInt(windowSize);
+ }
+
+ // Deserialization counterpart of write(); reads fields in the same order.
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ record = in.readInt();
+ auc = PolymorphicWritable.read(in, OnlineAuc.class);
+ logLikelihood = in.readDouble();
+ int n = in.readInt();
+ for (int i = 0; i < n; i++) {
+ OnlineLogisticRegression olr = new OnlineLogisticRegression();
+ olr.readFields(in);
+ models.add(olr);
+ }
+ parameters = new double[4];
+ for (int i = 0; i < 4; i++) {
+ parameters[i] = in.readDouble();
+ }
+ numFeatures = in.readInt();
+ prior = PolymorphicWritable.read(in, PriorFunction.class);
+ percentCorrect = in.readDouble();
+ windowSize = in.readInt();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
new file mode 100644
index 0000000..dbf3198
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+
+import org.apache.commons.csv.CSVUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.ContinuousValueEncoder;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+import org.apache.mahout.vectorizer.encoders.TextValueEncoder;
+
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+/**
+ * Converts CSV data lines to vectors.
+ *
+ * Use of this class proceeds in a few steps.
+ * <ul>
+ * <li> At construction time, you tell the class about the target variable and provide
+ * a dictionary of the types of the predictor values. At this point,
+ * the class cannot yet decode inputs because it doesn't know the fields that are in the
+ * data records, nor their order.
+ * <li> Optionally, you tell the parser object about the possible values of the target
+ * variable. If you don't do this then you probably should set the number of distinct
+ * values so that the target variable values will be taken from a restricted range.
+ * <li> Later, when you get a list of the fields, typically from the first line of a CSV
+ * file, you tell the factory about these fields and it builds internal data structures
+ * that allow it to decode inputs. The most important internal state is the field numbers
+ * for various fields. After this point, you can use the factory for decoding data.
+ * <li> To encode data as a vector, you present a line of input to the factory and it
+ * mutates a vector that you provide. The factory also retains trace information so
+ * that it can approximately reverse engineer vectors later.
+ * <li> After converting data, you can ask for an explanation of the data in terms of
+ * terms and weights. In order to explain a vector accurately, the factory needs to
+ * have seen the particular values of categorical fields (typically during encoding vectors)
+ * and needs to have a reasonably small number of collisions in the vector encoding.
+ * </ul>
+ */
+public class CsvRecordFactory implements RecordFactory {
+ // Synthetic "variable name" used for the bias term (predictor column -1).
+ private static final String INTERCEPT_TERM = "Intercept Term";
+
+ // Maps the type names accepted in typeMap to their encoder implementations.
+ private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
+ ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
+ .put("continuous", ContinuousValueEncoder.class)
+ .put("numeric", ContinuousValueEncoder.class)
+ .put("n", ContinuousValueEncoder.class)
+ .put("word", StaticWordValueEncoder.class)
+ .put("w", StaticWordValueEncoder.class)
+ .put("text", TextValueEncoder.class)
+ .put("t", TextValueEncoder.class)
+ .build();
+
+ // Records which vector slots each term hashed into, for later explanation.
+ private final Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
+
+ // Column number of the target variable, resolved in firstLine().
+ private int target;
+ private final Dictionary targetDictionary;
+
+ // Which column is used to identify a CSV file line
+ private String idName;
+ private int id = -1;
+
+ // Column numbers of the predictor variables (-1 denotes the bias term).
+ private List<Integer> predictors;
+ private Map<Integer, FeatureVectorEncoder> predictorEncoders;
+ private int maxTargetValue = Integer.MAX_VALUE;
+ private final String targetName;
+ private final Map<String, String> typeMap;
+ private List<String> variableNames;
+ private boolean includeBiasTerm;
+ private static final String CANNOT_CONSTRUCT_CONVERTER =
+ "Unable to construct type converter... shouldn't be possible";
+
+ /**
+ * Parse a single line of CSV-formatted text into its fields.
+ *
+ * Kept as a single choke point so the CSV-parsing strategy can be swapped
+ * for the whole class in one place.
+ * @param line - CSV formatted text
+ * @return List<String>
+ */
+ private List<String> parseCsvLine(String line) {
+ try {
+ return Arrays.asList(CSVUtils.parseLine(line));
+ } catch (IOException ioe) {
+ // Unparseable input: fall back to treating the entire line as one field.
+ return new ArrayList<>(Collections.singletonList(line));
+ }
+ }
+
+ // Convenience overload: delegates to the String variant.
+ private List<String> parseCsvLine(CharSequence line) {
+ return parseCsvLine(line.toString());
+ }
+
+ /**
+ * Construct a parser for CSV lines that encodes the parsed data in vector form.
+ * @param targetName The name of the target variable.
+ * @param typeMap A map describing the types of the predictor variables.
+ */
+ public CsvRecordFactory(String targetName, Map<String, String> typeMap) {
+ this.targetName = targetName;
+ this.typeMap = typeMap;
+ targetDictionary = new Dictionary();
+ }
+
+ /**
+ * Construct a parser that additionally remembers which column identifies
+ * each CSV line.
+ * @param targetName The name of the target variable.
+ * @param idName The name of the id column.
+ * @param typeMap A map describing the types of the predictor variables.
+ */
+ public CsvRecordFactory(String targetName, String idName, Map<String, String> typeMap) {
+ this(targetName, typeMap);
+ this.idName = idName;
+ }
+
+ /**
+ * Defines the values and thus the encoding of values of the target variables. Note
+ * that any values of the target variable not present in this list will be given the
+ * value of the last member of the list.
+ * @param values The values the target variable can have.
+ */
+ @Override
+ public void defineTargetCategories(List<String> values) {
+ // Templated %s form (consistent with the other checkArgument calls in this
+ // class) so the failure message is only built when the check actually fails.
+ Preconditions.checkArgument(
+ values.size() <= maxTargetValue,
+ "Must have less than or equal to %s categories for target variable, but found %s",
+ maxTargetValue, values.size());
+ if (maxTargetValue == Integer.MAX_VALUE) {
+ // No explicit limit was configured: lock it to the category count.
+ maxTargetValue = values.size();
+ }
+
+ // Intern each value so the dictionary assigns it a stable code.
+ for (String value : values) {
+ targetDictionary.intern(value);
+ }
+ }
+
+ /**
+ * Defines the number of target variable categories, but allows this parser to
+ * pick encodings for them as they appear.
+ * @param max The number of categories that will be expected. Once this many have been
+ * seen, all others will get the encoding max-1.
+ */
+ @Override
+ public CsvRecordFactory maxTargetValue(int max) {
+ this.maxTargetValue = max;
+ return this;
+ }
+
+ /**
+ * CSV input always carries the variable names on its first line.
+ * @return true, so callers must feed the header line to {@link #firstLine(String)}.
+ */
+ @Override
+ public boolean usesFirstLineAsSchema() {
+ return true;
+ }
+
+ /**
+ * Processes the first line of a file (which should contain the variable names). The target and
+ * predictor column numbers are set from the names on this line.
+ *
+ * @param line Header line for the file.
+ */
+ @Override
+ public void firstLine(String line) {
+ // read variable names, build map of name -> column
+ final Map<String, Integer> vars = new HashMap<>();
+ variableNames = parseCsvLine(line);
+ int column = 0;
+ for (String var : variableNames) {
+ vars.put(var, column++);
+ }
+
+ // record target column and establish dictionary for decoding target
+ // NOTE(review): this is null when the header has no column named targetName;
+ // later processLine()/getTargetString() calls would then NPE on unboxing —
+ // confirm headers always contain the target column.
+ target = vars.get(targetName);
+
+ // record id column
+ if (idName != null) {
+ id = vars.get(idName);
+ }
+
+ // create list of predictor column numbers
+ predictors = new ArrayList<>(Collections2.transform(typeMap.keySet(), new Function<String, Integer>() {
+ @Override
+ public Integer apply(String from) {
+ Integer r = vars.get(from);
+ Preconditions.checkArgument(r != null, "Can't find variable %s, only know about %s", from, vars);
+ return r;
+ }
+ }));
+
+ if (includeBiasTerm) {
+ // -1 is the sentinel column number for the synthetic intercept term.
+ predictors.add(-1);
+ }
+ Collections.sort(predictors);
+
+ // and map from column number to type encoder for each column that is a predictor
+ predictorEncoders = new HashMap<>();
+ for (Integer predictor : predictors) {
+ String name;
+ Class<? extends FeatureVectorEncoder> c;
+ if (predictor == -1) {
+ name = INTERCEPT_TERM;
+ c = ConstantValueEncoder.class;
+ } else {
+ name = variableNames.get(predictor);
+ c = TYPE_DICTIONARY.get(typeMap.get(name));
+ }
+ try {
+ Preconditions.checkArgument(c != null, "Invalid type of variable %s, wanted one of %s",
+ typeMap.get(name), TYPE_DICTIONARY.keySet());
+ // All encoders are constructed reflectively via a (String name) constructor.
+ Constructor<? extends FeatureVectorEncoder> constructor = c.getConstructor(String.class);
+ Preconditions.checkArgument(constructor != null, "Can't find correct constructor for %s", typeMap.get(name));
+ FeatureVectorEncoder encoder = constructor.newInstance(name);
+ predictorEncoders.put(predictor, encoder);
+ encoder.setTraceDictionary(traceDictionary);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+ } catch (InvocationTargetException e) {
+ throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+ } catch (NoSuchMethodException e) {
+ throw new IllegalStateException(CANNOT_CONSTRUCT_CONVERTER, e);
+ }
+ }
+ }
+
+
+ /**
+ * Decodes a single line of CSV data and records the target and predictor variables in a record.
+ * As a side effect, features are added into the featureVector. Returns the value of the target
+ * variable.
+ *
+ * @param line The raw data.
+ * @param featureVector Where to fill in the features. Should be zeroed before calling
+ * processLine.
+ * @return The value of the target variable.
+ */
+ @Override
+ public int processLine(String line, Vector featureVector) {
+ // Delegate to the three-argument overload with returnTarget = true; this keeps
+ // the target-decoding and feature-encoding logic in a single place instead of
+ // duplicating it here.
+ return processLine(line, featureVector, true);
+ }
+
+ /***
+ * Decodes a single line of CSV data and records the target (if returnTarget is true)
+ * and predictor variables in a record. As a side effect, features are added into the featureVector.
+ * Returns the value of the target variable. When used during classify against production data without
+ * target value, the method will be called with returnTarget = false.
+ * @param line The raw data.
+ * @param featureVector Where to fill in the features. Should be zeroed before calling
+ * processLine.
+ * @param returnTarget whether process and return target value, -1 will be returned if false.
+ * @return The value of the target variable.
+ */
+ public int processLine(CharSequence line, Vector featureVector, boolean returnTarget) {
+ List<String> values = parseCsvLine(line);
+ int targetValue = -1;
+ if (returnTarget) {
+ targetValue = targetDictionary.intern(values.get(target));
+ // Values beyond the configured maximum all collapse into the last category.
+ if (targetValue >= maxTargetValue) {
+ targetValue = maxTargetValue - 1;
+ }
+ }
+
+ for (Integer predictor : predictors) {
+ // The intercept term (column -1) is encoded from a null value.
+ String value = predictor >= 0 ? values.get(predictor) : null;
+ predictorEncoders.get(predictor).addToVector(value, featureVector);
+ }
+ return targetValue;
+ }
+
+ /***
+ * Extract the raw target string from a line read from a CSV file.
+ * @param line the line of content read from CSV file
+ * @return the raw target value in the corresponding column of CSV line
+ */
+ public String getTargetString(CharSequence line) {
+ List<String> values = parseCsvLine(line);
+ // Requires firstLine() to have been called so that 'target' is set.
+ return values.get(target);
+
+ }
+
+ /***
+ * Extract the corresponding raw target label according to a code.
+ * Performs a linear scan over the dictionary values, re-interning each one
+ * until the code matches.
+ * @param code the integer code encoded during training process
+ * @return the raw target label, or null if the code has never been seen
+ */
+ public String getTargetLabel(int code) {
+ for (String key : targetDictionary.values()) {
+ if (targetDictionary.intern(key) == code) {
+ return key;
+ }
+ }
+ return null;
+ }
+
+ /***
+ * Extract the id column value from the CSV record.
+ * NOTE(review): assumes an id column was configured (idName set) and present in the
+ * header; otherwise 'id' is null and this throws an NPE — confirm with callers.
+ * @param line the line of content read from CSV file
+ * @return the id value of the CSV record
+ */
+ public String getIdString(CharSequence line) {
+ List<String> values = parseCsvLine(line);
+ return values.get(id);
+ }
+
+ /**
+ * Returns a list of the names of the predictor variables.
+ *
+ * @return A list of variable names; the synthetic intercept column (-1) is
+ * reported under the INTERCEPT_TERM name.
+ */
+ @Override
+ public Iterable<String> getPredictors() {
+ return Lists.transform(predictors, new Function<Integer, String>() {
+ @Override
+ public String apply(Integer v) {
+ if (v >= 0) {
+ return variableNames.get(v);
+ } else {
+ return INTERCEPT_TERM;
+ }
+ }
+ });
+ }
+
+ /**
+ * @return The trace dictionary mapping feature names to the vector slots they touched.
+ */
+ @Override
+ public Map<String, Set<Integer>> getTraceDictionary() {
+ return traceDictionary;
+ }
+
+ /**
+ * Chainable configuration option.
+ * @param useBias whether a constant bias (intercept) term should be added to each vector.
+ * @return This, so other configuration options can be chained.
+ */
+ @Override
+ public CsvRecordFactory includeBiasTerm(boolean useBias) {
+ includeBiasTerm = useBias;
+ return this;
+ }
+
+ /**
+ * @return The target category labels, truncated to at most maxTargetValue entries.
+ */
+ @Override
+ public List<String> getTargetCategories() {
+ List<String> r = targetDictionary.values();
+ if (r.size() > maxTargetValue) {
+ // NOTE(review): subList().clear() mutates the list returned by values();
+ // presumably Dictionary.values() returns a copy — verify, otherwise this
+ // would corrupt the dictionary's internal state.
+ r.subList(maxTargetValue, r.size()).clear();
+ }
+ return r;
+ }
+
+ /** @return The name of the id column, or null if none was configured. */
+ public String getIdName() {
+ return idName;
+ }
+
+ /** @param idName The name of the id column to extract via {@link #getIdString}. */
+ public void setIdName(String idName) {
+ this.idName = idName;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
new file mode 100644
index 0000000..f81d8ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/DefaultGradient.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+/**
+ * Implements the basic logistic training law.
+ */
+public class DefaultGradient implements Gradient {
+ /**
+ * Provides a default gradient computation useful for logistic regression.
+ *
+ * @param groupKey A grouping key to allow per-something AUC loss to be used for training.
+ * @param actual The target variable value.
+ * @param instance The current feature vector to use for gradient computation
+ * @param classifier The classifier that can compute scores
+ * @return The gradient to be applied to beta
+ */
+ @Override
+ public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+ // Ask the current model for its predicted category probabilities.
+ Vector predicted = classifier.classify(instance);
+
+ // Build the observed outcome as a one-hot vector; category 0 is represented
+ // implicitly by the all-zero vector, so only nonzero categories set a bit.
+ Vector gradient = predicted.like();
+ if (actual != 0) {
+ gradient.setQuick(actual - 1, 1);
+ }
+ // Gradient is (observed - predicted).
+ gradient.assign(predicted, Functions.MINUS);
+ return gradient;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
new file mode 100644
index 0000000..8128370
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ElasticBandPrior.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements a linear combination of L1 and L2 priors. This can give an
+ * interesting mixture of sparsity and load-sharing between redundant predictors.
+ */
+public class ElasticBandPrior implements PriorFunction {
+ // Mixing coefficient: weight of the L2 component relative to L1.
+ private double alphaByLambda;
+ private L1 l1;
+ private L2 l2;
+
+ /** No-argument constructor required by the Writable machinery. */
+ public ElasticBandPrior() {
+ this(0.0);
+ }
+
+ public ElasticBandPrior(double alphaByLambda) {
+ this.alphaByLambda = alphaByLambda;
+ l1 = new L1();
+ l2 = new L2(1);
+ }
+
+ @Override
+ public double age(double oldValue, double generations, double learningRate) {
+ // L2-style multiplicative shrinkage first, then L1-style subtraction.
+ double shrunk = oldValue * Math.pow(1 - alphaByLambda * learningRate, generations);
+ double proposed = shrunk - Math.signum(shrunk) * learningRate * generations;
+ // Truncate at zero: a correction may never flip the coefficient's sign.
+ return proposed * shrunk < 0.0 ? 0.0 : proposed;
+ }
+
+ @Override
+ public double logP(double betaIJ) {
+ // Log-prior is the L1 term plus the weighted L2 term.
+ return l1.logP(betaIJ) + alphaByLambda * l2.logP(betaIJ);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeDouble(alphaByLambda);
+ l1.write(out);
+ l2.write(out);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ alphaByLambda = in.readDouble();
+ l1 = new L1();
+ l1.readFields(in);
+ l2 = new L2();
+ l2.readFields(in);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
new file mode 100644
index 0000000..524fc06
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/Gradient.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Provides the ability to inject a gradient into the SGD logistic regression.
+ * Typical uses of this are to use a ranking score such as AUC instead of a
+ * normal loss function.
+ */
+public interface Gradient {
+ /**
+ * Computes the gradient to apply to the model coefficients for one training example.
+ * @param groupKey Grouping key for group-wise (e.g. per-query) loss computations.
+ * @param actual The observed target category.
+ * @param instance The feature vector for this example.
+ * @param classifier The classifier used to score the instance.
+ * @return The gradient vector to be applied to beta.
+ */
+ Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
new file mode 100644
index 0000000..90ef7a8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/GradientMachine.java
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.Functions;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+
+/**
+ * Online gradient machine learner that tries to minimize the label ranking hinge loss.
+ * Implements a gradient machine with one sigmoid hidden layer.
+ * It tries to minimize the ranking loss of some given set of labels,
+ * so this can be used for multi-class, multi-label
+ * or auto-encoding of sparse data (e.g. text).
+ */
+public class GradientMachine extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+ public static final int WRITABLE_VERSION = 1;
+
+ // the learning rate of the algorithm
+ private double learningRate = 0.1;
+
+ // the regularization term, a positive number that controls the size of the weight vector
+ private double regularization = 0.1;
+
+ // the sparsity term, a positive number that controls the sparsity of the hidden layer. (0 - 1)
+ private double sparsity = 0.1;
+
+ // the sparsity learning rate.
+ private double sparsityLearningRate = 0.1;
+
+ // the number of features
+ private int numFeatures = 10;
+ // the number of hidden nodes
+ private int numHidden = 100;
+ // the number of output nodes
+ private int numOutput = 2;
+
+ // coefficients for the input to hidden layer.
+ // There are numHidden Vectors of dimension numFeatures.
+ private Vector[] hiddenWeights;
+
+ // coefficients for the hidden to output layer.
+ // There are numOutput Vectors of dimension numHidden.
+ private Vector[] outputWeights;
+
+ // hidden unit bias
+ private Vector hiddenBias;
+
+ // output unit bias
+ private Vector outputBias;
+
+ private final Random rnd;
+
+ public GradientMachine(int numFeatures, int numHidden, int numOutput) {
+ this.numFeatures = numFeatures;
+ this.numHidden = numHidden;
+ this.numOutput = numOutput;
+ // All weights and biases start at zero; call initWeights() for random init.
+ hiddenWeights = new DenseVector[numHidden];
+ for (int i = 0; i < numHidden; i++) {
+ hiddenWeights[i] = new DenseVector(numFeatures);
+ hiddenWeights[i].assign(0);
+ }
+ hiddenBias = new DenseVector(numHidden);
+ hiddenBias.assign(0);
+ outputWeights = new DenseVector[numOutput];
+ for (int i = 0; i < numOutput; i++) {
+ outputWeights[i] = new DenseVector(numHidden);
+ outputWeights[i].assign(0);
+ }
+ outputBias = new DenseVector(numOutput);
+ outputBias.assign(0);
+ rnd = RandomUtils.getRandom();
+ }
+
+ /**
+ * Initialize weights uniformly in [-fanIn, fanIn] where fanIn = 1/sqrt(inputs).
+ *
+ * @param gen random number generator.
+ */
+ public void initWeights(Random gen) {
+ double hiddenFanIn = 1.0 / Math.sqrt(numFeatures);
+ for (int i = 0; i < numHidden; i++) {
+ for (int j = 0; j < numFeatures; j++) {
+ double val = (2.0 * gen.nextDouble() - 1.0) * hiddenFanIn;
+ hiddenWeights[i].setQuick(j, val);
+ }
+ }
+ double outputFanIn = 1.0 / Math.sqrt(numHidden);
+ for (int i = 0; i < numOutput; i++) {
+ for (int j = 0; j < numHidden; j++) {
+ double val = (2.0 * gen.nextDouble() - 1.0) * outputFanIn;
+ outputWeights[i].setQuick(j, val);
+ }
+ }
+ }
+
+ /**
+ * Chainable configuration option.
+ *
+ * @param learningRate New value of initial learning rate.
+ * @return This, so other configurations can be chained.
+ */
+ public GradientMachine learningRate(double learningRate) {
+ this.learningRate = learningRate;
+ return this;
+ }
+
+ /**
+ * Chainable configuration option.
+ *
+ * @param regularization A positive value that controls the weight vector size.
+ * @return This, so other configurations can be chained.
+ */
+ public GradientMachine regularization(double regularization) {
+ this.regularization = regularization;
+ return this;
+ }
+
+ /**
+ * Chainable configuration option.
+ *
+ * @param sparsity A value between zero and one that controls the fraction of hidden units
+ * that are activated on average.
+ * @return This, so other configurations can be chained.
+ */
+ public GradientMachine sparsity(double sparsity) {
+ this.sparsity = sparsity;
+ return this;
+ }
+
+ /**
+ * Chainable configuration option.
+ *
+ * @param sparsityLearningRate New value of initial learning rate for sparsity.
+ * @return This, so other configurations can be chained.
+ */
+ public GradientMachine sparsityLearningRate(double sparsityLearningRate) {
+ this.sparsityLearningRate = sparsityLearningRate;
+ return this;
+ }
+
+ /**
+ * Deep-copies all hyper-parameters and weights from another machine.
+ */
+ public void copyFrom(GradientMachine other) {
+ numFeatures = other.numFeatures;
+ numHidden = other.numHidden;
+ numOutput = other.numOutput;
+ learningRate = other.learningRate;
+ regularization = other.regularization;
+ sparsity = other.sparsity;
+ sparsityLearningRate = other.sparsityLearningRate;
+ hiddenWeights = new DenseVector[numHidden];
+ for (int i = 0; i < numHidden; i++) {
+ hiddenWeights[i] = other.hiddenWeights[i].clone();
+ }
+ hiddenBias = other.hiddenBias.clone();
+ outputWeights = new DenseVector[numOutput];
+ for (int i = 0; i < numOutput; i++) {
+ outputWeights[i] = other.outputWeights[i].clone();
+ }
+ outputBias = other.outputBias.clone();
+ }
+
+ @Override
+ public int numCategories() {
+ return numOutput;
+ }
+
+ public int numFeatures() {
+ return numFeatures;
+ }
+
+ public int numHidden() {
+ return numHidden;
+ }
+
+ /**
+ * Feeds forward from input to hidden unit.
+ *
+ * @return Hidden unit activations.
+ */
+ public DenseVector inputToHidden(Vector input) {
+ DenseVector activations = new DenseVector(numHidden);
+ for (int i = 0; i < numHidden; i++) {
+ activations.setQuick(i, hiddenWeights[i].dot(input));
+ }
+ activations.assign(hiddenBias, Functions.PLUS);
+ // Clamp pre-activations to [-40, 40] to avoid sigmoid over/underflow.
+ activations.assign(Functions.min(40.0)).assign(Functions.max(-40));
+ activations.assign(Functions.SIGMOID);
+ return activations;
+ }
+
+ /**
+ * Feeds forward from hidden to output. Output units are linear (no squashing).
+ *
+ * @return Output unit activations.
+ */
+ public DenseVector hiddenToOutput(Vector hiddenActivation) {
+ DenseVector activations = new DenseVector(numOutput);
+ for (int i = 0; i < numOutput; i++) {
+ activations.setQuick(i, outputWeights[i].dot(hiddenActivation));
+ }
+ activations.assign(outputBias, Functions.PLUS);
+ return activations;
+ }
+
+ /**
+ * Updates using ranking loss.
+ *
+ * @param hiddenActivation the hidden unit's activation
+ * @param goodLabels the labels you want ranked above others.
+ * @param numTrials how many times you want to search for the highest scoring bad label.
+ * Must be at least 1, or the sampled bad label index stays -1.
+ * @param gen Random number generator.
+ */
+ public void updateRanking(Vector hiddenActivation,
+ Collection<Integer> goodLabels,
+ int numTrials,
+ Random gen) {
+ // All the labels are good, do nothing.
+ if (goodLabels.size() >= numOutput) {
+ return;
+ }
+ for (Integer good : goodLabels) {
+ double goodScore = outputWeights[good].dot(hiddenActivation);
+ int highestBad = -1;
+ double highestBadScore = Double.NEGATIVE_INFINITY;
+ for (int i = 0; i < numTrials; i++) {
+ // Rejection-sample a label outside the good set.
+ int bad = gen.nextInt(numOutput);
+ while (goodLabels.contains(bad)) {
+ bad = gen.nextInt(numOutput);
+ }
+ double badScore = outputWeights[bad].dot(hiddenActivation);
+ if (badScore > highestBadScore) {
+ highestBadScore = badScore;
+ highestBad = bad;
+ }
+ }
+ int bad = highestBad;
+ // Hinge loss with margin 1; skip the update when the margin is satisfied.
+ double loss = 1.0 - goodScore + highestBadScore;
+ if (loss < 0.0) {
+ continue;
+ }
+ // Note from the loss above the gradient dloss/dy , y being the label is -1 for good
+ // and +1 for bad.
+ // dy / dw is just w since y = x' * w + b.
+ // Hence by the chain rule, dloss / dw = dloss / dy * dy / dw = -w.
+ // For the regularization part, 0.5 * lambda * w' w, the gradient is lambda * w.
+ // dy / db = 1.
+ Vector gradGood = outputWeights[good].clone();
+ gradGood.assign(Functions.NEGATE);
+ Vector propHidden = gradGood.clone();
+ Vector gradBad = outputWeights[bad].clone();
+ propHidden.assign(gradBad, Functions.PLUS);
+ gradGood.assign(Functions.mult(-learningRate * (1.0 - regularization)));
+ outputWeights[good].assign(gradGood, Functions.PLUS);
+ gradBad.assign(Functions.mult(-learningRate * (1.0 + regularization)));
+ outputWeights[bad].assign(gradBad, Functions.PLUS);
+ outputBias.setQuick(good, outputBias.get(good) + learningRate);
+ outputBias.setQuick(bad, outputBias.get(bad) - learningRate);
+ // Gradient of sigmoid is s * (1 -s).
+ Vector gradSig = hiddenActivation.clone();
+ gradSig.assign(Functions.SIGMOIDGRADIENT);
+ // Multiply by the change caused by the ranking loss.
+ for (int i = 0; i < numHidden; i++) {
+ gradSig.setQuick(i, gradSig.get(i) * propHidden.get(i));
+ }
+ // Back-propagate into the input-to-hidden weights with L2 regularization.
+ for (int i = 0; i < numHidden; i++) {
+ for (int j = 0; j < numFeatures; j++) {
+ double v = hiddenWeights[i].get(j);
+ v -= learningRate * (gradSig.get(i) + regularization * v);
+ hiddenWeights[i].setQuick(j, v);
+ }
+ }
+ }
+ }
+
+ @Override
+ public Vector classify(Vector instance) {
+ Vector result = classifyNoLink(instance);
+ // Find the max value's index and one-hot encode it.
+ int max = result.maxValueIndex();
+ result.assign(0);
+ result.setQuick(max, 1.0);
+ // Category 0 is represented implicitly: return only entries 1..n-1,
+ // matching the AbstractVectorClassifier convention.
+ return result.viewPart(1, result.size() - 1);
+ }
+
+ @Override
+ public Vector classifyNoLink(Vector instance) {
+ DenseVector hidden = inputToHidden(instance);
+ return hiddenToOutput(hidden);
+ }
+
+ @Override
+ public double classifyScalar(Vector instance) {
+ // NOTE(review): assumes at least two output units; only scores 0 and 1 are compared.
+ Vector output = classifyNoLink(instance);
+ if (output.get(0) > output.get(1)) {
+ return 0;
+ }
+ return 1;
+ }
+
+ public GradientMachine copy() {
+ // close() is a no-op for this online learner; called for interface symmetry.
+ close();
+ GradientMachine r = new GradientMachine(numFeatures(), numHidden(), numCategories());
+ r.copyFrom(this);
+ return r;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(WRITABLE_VERSION);
+ out.writeDouble(learningRate);
+ out.writeDouble(regularization);
+ out.writeDouble(sparsity);
+ out.writeDouble(sparsityLearningRate);
+ out.writeInt(numFeatures);
+ out.writeInt(numHidden);
+ out.writeInt(numOutput);
+ VectorWritable.writeVector(out, hiddenBias);
+ for (int i = 0; i < numHidden; i++) {
+ VectorWritable.writeVector(out, hiddenWeights[i]);
+ }
+ VectorWritable.writeVector(out, outputBias);
+ for (int i = 0; i < numOutput; i++) {
+ VectorWritable.writeVector(out, outputWeights[i]);
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ // Field order must mirror write() exactly.
+ int version = in.readInt();
+ if (version == WRITABLE_VERSION) {
+ learningRate = in.readDouble();
+ regularization = in.readDouble();
+ sparsity = in.readDouble();
+ sparsityLearningRate = in.readDouble();
+ numFeatures = in.readInt();
+ numHidden = in.readInt();
+ numOutput = in.readInt();
+ hiddenWeights = new DenseVector[numHidden];
+ hiddenBias = VectorWritable.readVector(in);
+ for (int i = 0; i < numHidden; i++) {
+ hiddenWeights[i] = VectorWritable.readVector(in);
+ }
+ outputWeights = new DenseVector[numOutput];
+ outputBias = VectorWritable.readVector(in);
+ for (int i = 0; i < numOutput; i++) {
+ outputWeights[i] = VectorWritable.readVector(in);
+ }
+ } else {
+ throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
+ }
+ }
+
+ @Override
+ public void close() {
+ // This is an online classifier, nothing to do.
+ }
+
+ @Override
+ public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+ Vector hiddenActivation = inputToHidden(instance);
+ hiddenToOutput(hiddenActivation);
+ Collection<Integer> goodLabels = new HashSet<>();
+ goodLabels.add(actual);
+ updateRanking(hiddenActivation, goodLabels, 2, rnd);
+ }
+
+ @Override
+ public void train(long trackingKey, int actual, Vector instance) {
+ train(trackingKey, null, actual, instance);
+ }
+
+ @Override
+ public void train(int actual, Vector instance) {
+ train(0, null, actual, instance);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
new file mode 100644
index 0000000..28a05f2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L1.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements the Laplacian or bi-exponential prior. This prior has a strong tendency to set coefficients to zero
+ * and thus is useful as an alternative to variable selection. This version implements truncation which prevents
+ * a coefficient from changing sign. If a correction would change the sign, the coefficient is truncated to zero.
+ *
+ * Note that it doesn't matter to have a scale for this distribution because after taking the derivative of the logP,
+ * the lambda coefficient used to combine the prior with the observations has the same effect. If we had a scale here,
+ * then it would be the same effect as just changing lambda.
+ */
+public class L1 implements PriorFunction {
+ @Override
+ public double age(double oldValue, double generations, double learningRate) {
+ // Pull the coefficient toward zero by a fixed step per generation.
+ double proposed = oldValue - Math.signum(oldValue) * learningRate * generations;
+ // Truncation: the correction may shrink the coefficient to zero but never flip its sign.
+ return proposed * oldValue < 0 ? 0 : proposed;
+ }
+
+ @Override
+ public double logP(double betaIJ) {
+ // Laplacian log-density up to a constant.
+ return -Math.abs(betaIJ);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ // stateless class has nothing to serialize
+ }
+
+ @Override
+ public void readFields(DataInput dataInput) throws IOException {
+ // stateless class has nothing to serialize
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
new file mode 100644
index 0000000..3dfb9fc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/L2.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Implements the Gaussian prior. This prior has a tendency to decrease large coefficients toward zero, but
+ * doesn't tend to set them to exactly zero.
+ */
+public class L2 implements PriorFunction {
+
+ private static final double HALF_LOG_2PI = Math.log(2.0 * Math.PI) / 2.0;
+
+ // Variance (scale squared) and scale of the Gaussian.
+ private double s2;
+ private double s;
+
+ public L2(double scale) {
+ s = scale;
+ s2 = scale * scale;
+ }
+
+ /** Defaults to unit scale. */
+ public L2() {
+ this(1.0);
+ }
+
+ @Override
+ public double age(double oldValue, double generations, double learningRate) {
+ // Multiplicative shrinkage toward zero, compounded over generations.
+ return oldValue * Math.pow(1.0 - learningRate / s2, generations);
+ }
+
+ @Override
+ public double logP(double betaIJ) {
+ // Full Gaussian log-density including the normalization constant.
+ return -betaIJ * betaIJ / s2 / 2.0 - Math.log(s) - HALF_LOG_2PI;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeDouble(s2);
+ out.writeDouble(s);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ s2 = in.readDouble();
+ s = in.readDouble();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
new file mode 100644
index 0000000..a290b22
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/MixedGradient.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+
+import java.util.Random;
+
+/**
+ * <p>Provides a stochastic mixture of ranking updates and normal logistic updates. This uses a
+ * combination of AUC driven learning to improve ranking performance and traditional log-loss driven
+ * learning to improve log-likelihood.</p>
+ *
+ * <p>See www.eecs.tufts.edu/~dsculley/papers/combined-ranking-and-regression.pdf</p>
+ *
+ * <p>This implementation only makes sense for the binomial case.</p>
+ */
+public class MixedGradient implements Gradient {
+
+  // fraction of updates drawn as ranking (AUC-driven) updates rather than plain log-loss updates
+  private final double alpha;
+  private final RankingGradient rank;
+  // underlying log-loss gradient shared with the ranking gradient so both see the same base model
+  private final Gradient basic;
+  private final Random random = RandomUtils.getRandom();
+  // whether we have seen at least one example of each binary class via the basic branch;
+  // a ranking update only makes sense once both classes are represented in the history
+  private boolean hasZero;
+  private boolean hasOne;
+
+  public MixedGradient(double alpha, int window) {
+    this.alpha = alpha;
+    this.rank = new RankingGradient(window);
+    this.basic = this.rank.getBaseGradient();
+  }
+
+  @Override
+  public Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+    if (random.nextDouble() < alpha) {
+      // one option is to apply a ranking update relative to our recent history
+      // NOTE(review): this throws if the ranking branch is drawn before both classes have
+      // been observed through the basic branch below, so very early training can fail here
+      // with probability alpha per example — confirm callers tolerate (or retry past) that.
+      if (!hasZero || !hasOne) {
+        throw new IllegalStateException();
+      }
+      return rank.apply(groupKey, actual, instance, classifier);
+    } else {
+      hasZero |= actual == 0;
+      hasOne |= actual == 1;
+      // the other option is a normal update, but we have to update our history on the way
+      rank.addToHistory(actual, instance);
+      return basic.apply(groupKey, actual, instance, classifier);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
new file mode 100644
index 0000000..bcd2ebc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelDissector.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.Ordering;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.Queue;
+import java.util.Set;
+
+/**
+ * Uses sample data to reverse engineer a feature-hashed model.
+ *
+ * The result gives approximate weights for features and interactions
+ * in the original space.
+ *
+ * The idea is that the hashed encoders have the option of having a trace dictionary. This
+ * tells us where each feature is hashed to, or each feature/value combination in the case
+ * of word-like values. Using this dictionary, we can put values into a synthetic feature
+ * vector in just the locations specified by a single feature or interaction. Then we can
+ * push this through a linear part of a model to see the contribution of that input. For
+ * any generalized linear model like logistic regression, there is a linear part of the
+ * model that allows this.
+ *
+ * What the ModelDissector does is to accept a trace dictionary and a model in an update
+ * method. It figures out the weights for the elements in the trace dictionary and stashes
+ * them. Then in a summary method, the biggest weights are returned. This update/flush
+ * style is used so that the trace dictionary doesn't have to grow to enormous levels,
+ * but instead can be cleared between updates.
+ */
+public class ModelDissector {
+  // memoized per-category weight vector for each feature key probed so far
+  private final Map<String,Vector> weightMap;
+
+  public ModelDissector() {
+    weightMap = new HashMap<>();
+  }
+
+  /**
+   * Probes a model to determine the effect of a particular variable. This is done
+   * with the aid of a trace dictionary which has recorded the locations in the feature
+   * vector that are modified by various variable values. We can set these locations to
+   * 1 and then look at the resulting score. This tells us the weight the model places
+   * on that variable.
+   * @param features A feature vector to use (destructively)
+   * @param traceDictionary A trace dictionary containing variables and what locations
+   *                        in the feature vector are affected by them
+   * @param learner The model that we are probing to find weights on features
+   */
+  public void update(Vector features, Map<String, Set<Integer>> traceDictionary, AbstractVectorClassifier learner) {
+    // zero out feature vector
+    features.assign(0);
+    for (Map.Entry<String, Set<Integer>> entry : traceDictionary.entrySet()) {
+      // get a feature and locations where it is stored in the feature vector
+      String key = entry.getKey();
+      Set<Integer> value = entry.getValue();
+
+      // if we haven't looked at this feature yet
+      if (!weightMap.containsKey(key)) {
+        // put probe values in the feature vector
+        for (Integer where : value) {
+          features.set(where, 1);
+        }
+
+        // see what the model says
+        Vector v = learner.classifyNoLink(features);
+        weightMap.put(key, v);
+
+        // and zero out those locations again so the vector is clean for the next probe
+        for (Integer where : value) {
+          features.set(where, 0);
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the n most important features with their
+   * weights, most important category and the top few
+   * categories that they affect.
+   * @param n How many results to return.
+   * @return A list of the top variables.
+   */
+  public List<Weight> summary(int n) {
+    // min-heap trimmed to size n keeps the n largest weights seen so far
+    Queue<Weight> pq = new PriorityQueue<>();
+    for (Map.Entry<String, Vector> entry : weightMap.entrySet()) {
+      pq.add(new Weight(entry.getKey(), entry.getValue()));
+      while (pq.size() > n) {
+        pq.poll();
+      }
+    }
+    // descending order: biggest weight first
+    List<Weight> r = new ArrayList<>(pq);
+    Collections.sort(r, Ordering.natural().reverse());
+    return r;
+  }
+
+  /**
+   * A single (category index, weight) pair, ordered by absolute weight with
+   * ties broken by index (larger index sorts as smaller).
+   */
+  private static final class Category implements Comparable<Category> {
+    private final int index;
+    private final double weight;
+
+    private Category(int index, double weight) {
+      this.index = index;
+      this.weight = weight;
+    }
+
+    @Override
+    public int compareTo(Category o) {
+      // magnitude comparison; sign of the weight is preserved but ignored for ordering
+      int r = Double.compare(Math.abs(weight), Math.abs(o.weight));
+      if (r == 0) {
+        if (o.index < index) {
+          return -1;
+        }
+        if (o.index > index) {
+          return 1;
+        }
+        return 0;
+      }
+      return r;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (!(o instanceof Category)) {
+        return false;
+      }
+      Category other = (Category) o;
+      return index == other.index && weight == other.weight;
+    }
+
+    @Override
+    public int hashCode() {
+      return RandomUtils.hashDouble(weight) ^ index;
+    }
+
+  }
+
+  /**
+   * A named feature together with its most influential categories, ordered by
+   * the absolute value of its largest (signed) weight.
+   */
+  public static class Weight implements Comparable<Weight> {
+    private final String feature;
+    // signed weight with the largest magnitude across categories
+    private final double value;
+    // category index where that largest-magnitude weight occurs
+    private final int maxIndex;
+    // top categories for this feature, sorted by descending |weight|
+    private final List<Category> categories;
+
+    public Weight(String feature, Vector weights) {
+      this(feature, weights, 3);
+    }
+
+    public Weight(String feature, Vector weights, int n) {
+      this.feature = feature;
+      // pick out the weight with the largest abs value, but don't forget the sign
+      Queue<Category> biggest = new PriorityQueue<>(n + 1, Ordering.natural());
+      for (Vector.Element element : weights.all()) {
+        biggest.add(new Category(element.index(), element.get()));
+        while (biggest.size() > n) {
+          biggest.poll();
+        }
+      }
+      categories = new ArrayList<>(biggest);
+      Collections.sort(categories, Ordering.natural().reverse());
+      value = categories.get(0).weight;
+      maxIndex = categories.get(0).index;
+    }
+
+    @Override
+    public int compareTo(Weight other) {
+      // order by magnitude; ties fall back to the feature name for determinism
+      int r = Double.compare(Math.abs(this.value), Math.abs(other.value));
+      if (r == 0) {
+        return feature.compareTo(other.feature);
+      }
+      return r;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (!(o instanceof Weight)) {
+        return false;
+      }
+      Weight other = (Weight) o;
+      return feature.equals(other.feature)
+          && value == other.value
+          && maxIndex == other.maxIndex
+          && categories.equals(other.categories);
+    }
+
+    @Override
+    public int hashCode() {
+      return feature.hashCode() ^ RandomUtils.hashDouble(value) ^ maxIndex ^ categories.hashCode();
+    }
+
+    public String getFeature() {
+      return feature;
+    }
+
+    public double getWeight() {
+      return value;
+    }
+
+    public double getWeight(int n) {
+      return categories.get(n).weight;
+    }
+
+    // NOTE(review): returns a category *index* widened to double — looks like a
+    // historical API quirk; confirm callers before tightening the return type.
+    public double getCategory(int n) {
+      return categories.get(n).index;
+    }
+
+    public int getMaxImpact() {
+      return maxIndex;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
new file mode 100644
index 0000000..f89b245
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/ModelSerializer.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Static helpers for saving SGD models to files and loading them back from streams.
+ */
+public final class ModelSerializer {
+
+  // utility class: no instances
+  private ModelSerializer() {
+  }
+
+  public static void writeBinary(String path, CrossFoldLearner model) throws IOException {
+    try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(path))) {
+      PolymorphicWritable.write(stream, model);
+    }
+  }
+
+  public static void writeBinary(String path, OnlineLogisticRegression model) throws IOException {
+    try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(path))) {
+      PolymorphicWritable.write(stream, model);
+    }
+  }
+
+  public static void writeBinary(String path, AdaptiveLogisticRegression model) throws IOException {
+    try (DataOutputStream stream = new DataOutputStream(new FileOutputStream(path))) {
+      PolymorphicWritable.write(stream, model);
+    }
+  }
+
+  /**
+   * Reads a model of the requested type from a stream. The supplied stream is
+   * always closed, even when reading fails.
+   */
+  public static <T extends Writable> T readBinary(InputStream in, Class<T> clazz) throws IOException {
+    DataInput source = new DataInputStream(in);
+    try {
+      return PolymorphicWritable.read(source, clazz);
+    } finally {
+      // close the underlying stream and propagate any close failure
+      Closeables.close(in, false);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
new file mode 100644
index 0000000..7a9ca83
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/OnlineLogisticRegression.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Extends the basic on-line logistic regression learner with a specific set of learning
+ * rate annealing schedules.
+ */
+public class OnlineLogisticRegression extends AbstractOnlineLogisticRegression implements Writable {
+  public static final int WRITABLE_VERSION = 1;
+
+  // these next two control decayFactor^steps exponential type of annealing
+  // learning rate and decay factor
+  private double mu0 = 1;
+  private double decayFactor = 1 - 1.0e-3;
+
+  // these next two control 1/steps^forget type annealing
+  private int stepOffset = 10;
+  // -1 equals even weighting of all examples, 0 means only use exponential annealing
+  private double forgettingExponent = -0.5;
+
+  // controls how per term annealing works
+  private int perTermAnnealingOffset = 20;
+
+  public OnlineLogisticRegression() {
+    // private constructor available for serialization, but not normal use
+  }
+
+  public OnlineLogisticRegression(int numCategories, int numFeatures, PriorFunction prior) {
+    this.numCategories = numCategories;
+    this.prior = prior;
+
+    // per-feature annealing state; counts start at the offset so the initial
+    // per-term rate from perTermLearningRate() is sqrt(offset/offset) == 1
+    updateSteps = new DenseVector(numFeatures);
+    updateCounts = new DenseVector(numFeatures).assign(perTermAnnealingOffset);
+    // one coefficient row per category except the implicit reference category
+    beta = new DenseMatrix(numCategories - 1, numFeatures);
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param alpha New value of decayFactor, the exponential decay rate for the learning rate.
+   * @return This, so other configurations can be chained.
+   */
+  public OnlineLogisticRegression alpha(double alpha) {
+    this.decayFactor = alpha;
+    return this;
+  }
+
+  @Override
+  public OnlineLogisticRegression lambda(double lambda) {
+    // we only over-ride this to provide a more restrictive return type
+    super.lambda(lambda);
+    return this;
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param learningRate New value of initial learning rate.
+   * @return This, so other configurations can be chained.
+   */
+  public OnlineLogisticRegression learningRate(double learningRate) {
+    this.mu0 = learningRate;
+    return this;
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param stepOffset Offset added to the step count in the 1/steps^forget schedule.
+   * @return This, so other configurations can be chained.
+   */
+  public OnlineLogisticRegression stepOffset(int stepOffset) {
+    this.stepOffset = stepOffset;
+    return this;
+  }
+
+  /**
+   * Chainable configuration option. A positive argument is silently negated so
+   * that the exponent always decreases the learning rate over time.
+   *
+   * @param decayExponent Magnitude of the 1/steps^forget exponent.
+   * @return This, so other configurations can be chained.
+   */
+  public OnlineLogisticRegression decayExponent(double decayExponent) {
+    if (decayExponent > 0) {
+      decayExponent = -decayExponent;
+    }
+    this.forgettingExponent = decayExponent;
+    return this;
+  }
+
+
+  @Override
+  public double perTermLearningRate(int j) {
+    // rate shrinks as feature j accumulates updates; starts at 1 because
+    // updateCounts is initialized to perTermAnnealingOffset
+    return Math.sqrt(perTermAnnealingOffset / updateCounts.get(j));
+  }
+
+  @Override
+  public double currentLearningRate() {
+    // combines exponential decay (decayFactor^step) with power-law decay
+    // ((step + offset)^forgettingExponent)
+    return mu0 * Math.pow(decayFactor, getStep()) * Math.pow(getStep() + stepOffset, forgettingExponent);
+  }
+
+  public void copyFrom(OnlineLogisticRegression other) {
+    super.copyFrom(other);
+    mu0 = other.mu0;
+    decayFactor = other.decayFactor;
+
+    stepOffset = other.stepOffset;
+    forgettingExponent = other.forgettingExponent;
+
+    perTermAnnealingOffset = other.perTermAnnealingOffset;
+  }
+
+  public OnlineLogisticRegression copy() {
+    // close() flushes any pending regularization before the state is copied
+    close();
+    OnlineLogisticRegression r = new OnlineLogisticRegression(numCategories(), numFeatures(), prior);
+    r.copyFrom(this);
+    return r;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // field order here must match readFields() exactly
+    out.writeInt(WRITABLE_VERSION);
+    out.writeDouble(mu0);
+    out.writeDouble(getLambda());
+    out.writeDouble(decayFactor);
+    out.writeInt(stepOffset);
+    out.writeInt(step);
+    out.writeDouble(forgettingExponent);
+    out.writeInt(perTermAnnealingOffset);
+    out.writeInt(numCategories);
+    MatrixWritable.writeMatrix(out, beta);
+    PolymorphicWritable.write(out, prior);
+    VectorWritable.writeVector(out, updateCounts);
+    VectorWritable.writeVector(out, updateSteps);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int version = in.readInt();
+    if (version == WRITABLE_VERSION) {
+      mu0 = in.readDouble();
+      lambda(in.readDouble());
+      decayFactor = in.readDouble();
+      stepOffset = in.readInt();
+      step = in.readInt();
+      forgettingExponent = in.readDouble();
+      perTermAnnealingOffset = in.readInt();
+      numCategories = in.readInt();
+      beta = MatrixWritable.readMatrix(in);
+      // prior is polymorphic: the concrete class name is stored in the stream
+      prior = PolymorphicWritable.read(in, PriorFunction.class);
+
+      updateCounts = VectorWritable.readVector(in);
+      updateSteps = VectorWritable.readVector(in);
+    } else {
+      throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
new file mode 100644
index 0000000..c51361c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PassiveAggressive.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.classifier.OnlineLearner;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Online passive aggressive learner that tries to minimize the label ranking hinge loss.
+ * Implements a multi-class linear classifier minimizing rank loss.
+ * based on "Online passive aggressive algorithms" by Cramer et al, 2006.
+ * Note: Its better to use classifyNoLink because the loss function is based
+ * on ensuring that the score of the good label is larger than the next
+ * highest label by some margin. The conversion to probability is just done
+ * by exponentiating and dividing by the sum and is empirical at best.
+ * Your features should be pre-normalized in some sensible range, for example,
+ * by subtracting the mean and standard deviation, if they are very
+ * different in magnitude from each other.
+ */
+public class PassiveAggressive extends AbstractVectorClassifier implements OnlineLearner, Writable {
+
+  private static final Logger log = LoggerFactory.getLogger(PassiveAggressive.class);
+
+  public static final int WRITABLE_VERSION = 1;
+
+  // the learning rate of the algorithm
+  private double learningRate = 0.1;
+
+  // running loss statistics, periodically logged and reset in train()
+  private int lossCount = 0;
+  private double lossSum = 0;
+
+  // coefficients for the classification. This is a dense matrix
+  // that is (numCategories ) x numFeatures
+  private Matrix weights;
+
+  // number of categories we are classifying.
+  private int numCategories;
+
+  public PassiveAggressive(int numCategories, int numFeatures) {
+    this.numCategories = numCategories;
+    // a freshly allocated DenseMatrix is already zero-filled, which is the model we want
+    weights = new DenseMatrix(numCategories, numFeatures);
+  }
+
+  /**
+   * Chainable configuration option.
+   *
+   * @param learningRate New value of initial learning rate.
+   * @return This, so other configurations can be chained.
+   */
+  public PassiveAggressive learningRate(double learningRate) {
+    this.learningRate = learningRate;
+    return this;
+  }
+
+  public void copyFrom(PassiveAggressive other) {
+    learningRate = other.learningRate;
+    numCategories = other.numCategories;
+    weights = other.weights;
+  }
+
+  @Override
+  public int numCategories() {
+    return numCategories;
+  }
+
+  @Override
+  public Vector classify(Vector instance) {
+    Vector result = classifyNoLink(instance);
+    // Convert to probabilities by exponentiation; subtracting the max first
+    // avoids overflow in exp().
+    double max = result.maxValue();
+    result.assign(Functions.minus(max)).assign(Functions.EXP);
+    result = result.divide(result.norm(1));
+
+    // drop category 0: AbstractVectorClassifier expects the remaining n-1 scores
+    return result.viewPart(1, result.size() - 1);
+  }
+
+  @Override
+  public Vector classifyNoLink(Vector instance) {
+    // raw (un-normalized) per-category scores: one dot product per weight row
+    Vector result = new DenseVector(weights.numRows());
+    for (int i = 0; i < weights.numRows(); i++) {
+      result.setQuick(i, weights.viewRow(i).dot(instance));
+    }
+    return result;
+  }
+
+  @Override
+  public double classifyScalar(Vector instance) {
+    // binary case: softmax over the two category scores, returning p(category 1)
+    double v1 = weights.viewRow(0).dot(instance);
+    double v2 = weights.viewRow(1).dot(instance);
+    v1 = Math.exp(v1);
+    v2 = Math.exp(v2);
+    return v2 / (v1 + v2);
+  }
+
+  public int numFeatures() {
+    return weights.numCols();
+  }
+
+  public PassiveAggressive copy() {
+    close();
+    PassiveAggressive r = new PassiveAggressive(numCategories(), numFeatures());
+    r.copyFrom(this);
+    return r;
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // field order here must match readFields() exactly
+    out.writeInt(WRITABLE_VERSION);
+    out.writeDouble(learningRate);
+    out.writeInt(numCategories);
+    MatrixWritable.writeMatrix(out, weights);
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    int version = in.readInt();
+    if (version == WRITABLE_VERSION) {
+      learningRate = in.readDouble();
+      numCategories = in.readInt();
+      weights = MatrixWritable.readMatrix(in);
+    } else {
+      throw new IOException("Incorrect object version, wanted " + WRITABLE_VERSION + " got " + version);
+    }
+  }
+
+  @Override
+  public void close() {
+    // This is an online classifier, nothing to do.
+  }
+
+  @Override
+  public void train(long trackingKey, String groupKey, int actual, Vector instance) {
+    // periodically log and reset the running loss statistics
+    if (lossCount > 1000) {
+      log.info("Avg. Loss = {}", lossSum / lossCount);
+      lossCount = 0;
+      lossSum = 0;
+    }
+    Vector result = classifyNoLink(instance);
+    double myScore = result.get(actual);
+    // Find the highest score that is not actual.
+    int otherIndex = result.maxValueIndex();
+    double otherValue = result.get(otherIndex);
+    if (otherIndex == actual) {
+      result.setQuick(otherIndex, Double.NEGATIVE_INFINITY);
+      otherIndex = result.maxValueIndex();
+      otherValue = result.get(otherIndex);
+    }
+    // hinge loss with margin 1 between the true label's score and the runner-up's
+    double loss = 1.0 - myScore + otherValue;
+    lossCount += 1;
+    if (loss >= 0) {
+      lossSum += loss;
+      // step size grows with the loss but is damped by the instance norm and
+      // by 0.5/learningRate (see the passive-aggressive paper for the derivation)
+      double tau = loss / (instance.dot(instance) + 0.5 / learningRate);
+      Vector delta = instance.clone();
+      delta.assign(Functions.mult(tau));
+      // push the true label's row toward the instance and the rival's row away from it
+      weights.viewRow(actual).assign(delta, Functions.PLUS);
+      delta.assign(Functions.mult(-1));
+      weights.viewRow(otherIndex).assign(delta, Functions.PLUS);
+    }
+  }
+
+  @Override
+  public void train(long trackingKey, int actual, Vector instance) {
+    train(trackingKey, null, actual, instance);
+  }
+
+  @Override
+  public void train(int actual, Vector instance) {
+    train(0, null, actual, instance);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
new file mode 100644
index 0000000..90062a6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PolymorphicWritable.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.ClassUtils;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * Serialization helpers that record the concrete class name ahead of the
+ * Writable payload so the right type can be re-instantiated on read.
+ */
+public final class PolymorphicWritable {
+
+  private PolymorphicWritable() {
+  }
+
+  /** Writes the value's class name, then delegates to the value's own serialization. */
+  public static <T extends Writable> void write(DataOutput dataOutput, T value) throws IOException {
+    dataOutput.writeUTF(value.getClass().getName());
+    value.write(dataOutput);
+  }
+
+  /** Reads a class name, instantiates it as a subtype of {@code clazz}, and fills in its fields. */
+  public static <T extends Writable> T read(DataInput dataInput, Class<? extends T> clazz) throws IOException {
+    String className = dataInput.readUTF();
+    T result = ClassUtils.instantiateAs(className, clazz);
+    result.readFields(dataInput);
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
new file mode 100644
index 0000000..857f061
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/PriorFunction.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A prior is used to regularize the learning algorithm. This allows a trade-off to
+ * be made between complexity of the model being learned and the accuracy with which
+ * the model fits the training data. There are different definitions of complexity
+ * which can be approximated using different priors. For large sparse systems, such
+ * as text classification, the L1 prior is often used which favors sparse models.
+ *
+ * <p>Extends {@link Writable} so that a prior can be serialized alongside the
+ * model that uses it.</p>
+ */
+public interface PriorFunction extends Writable {
+  /**
+   * Applies the regularization to a coefficient.
+   * @param oldValue The previous value.
+   * @param generations The number of generations.
+   * @param learningRate The learning rate with lambda baked in.
+   * @return The new coefficient value after regularization.
+   */
+  double age(double oldValue, double generations, double learningRate);
+
+  /**
+   * Returns the log of the probability of a particular coefficient value according to the prior.
+   * @param betaIJ The coefficient.
+   * @return The log probability.
+   */
+  double logP(double betaIJ);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
new file mode 100644
index 0000000..a04fc8b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/sgd/RankingGradient.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.Functions;
+
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+import java.util.List;
+
+/**
+ * Uses the difference between this instance and recent history to get a
+ * gradient that optimizes ranking performance. Essentially this is the
+ * same as directly optimizing AUC. It isn't expected that this would
+ * be used alone, but rather that a MixedGradient would use it and a
+ * DefaultGradient together to combine both ranking and log-likelihood
+ * goals.
+ */
+public class RankingGradient implements Gradient {
+
+ private static final Gradient BASIC = new DefaultGradient();
+
+ private int window = 10;
+
+ private final List<Deque<Vector>> history = new ArrayList<>();
+
+ public RankingGradient(int window) {
+ this.window = window;
+ }
+
+ @Override
+ public final Vector apply(String groupKey, int actual, Vector instance, AbstractVectorClassifier classifier) {
+ addToHistory(actual, instance);
+
+ // now compute average gradient versus saved vectors from the other side
+ Deque<Vector> otherSide = history.get(1 - actual);
+ int n = otherSide.size();
+
+ Vector r = null;
+ for (Vector other : otherSide) {
+ Vector g = BASIC.apply(groupKey, actual, instance.minus(other), classifier);
+
+ if (r == null) {
+ r = g;
+ } else {
+ r.assign(g, Functions.plusMult(1.0 / n));
+ }
+ }
+ return r;
+ }
+
+ public void addToHistory(int actual, Vector instance) {
+ while (history.size() <= actual) {
+ history.add(new ArrayDeque<Vector>(window));
+ }
+ // save this instance
+ Deque<Vector> ourSide = history.get(actual);
+ ourSide.add(instance);
+ while (ourSide.size() >= window) {
+ ourSide.pollFirst();
+ }
+ }
+
+ public Gradient getBaseGradient() {
+ return BASIC;
+ }
+}
r***@apache.org
2018-06-28 14:54:41 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
new file mode 100644
index 0000000..56b1a04
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/OptIgSplit.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataUtils;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.TreeSet;
+
+/**
+ * <p>Optimized implementation of IgSplit.
+ * This class can be used when the criterion variable is the categorical attribute.</p>
+ *
+ * <p>This code was changed in MAHOUT-1419 to deal in sampled splits among numeric
+ * features to fix a performance problem. To generate some synthetic data that exercises
+ * the issue, try for example generating 4 features of Normal(0,1) values with a random
+ * boolean 0/1 categorical feature. In Scala:</p>
+ *
+ * {@code
+ * val r = new scala.util.Random()
+ * val pw = new java.io.PrintWriter("random.csv")
+ * (1 to 10000000).foreach(e =>
+ * pw.println(r.nextDouble() + "," +
+ * r.nextDouble() + "," +
+ * r.nextDouble() + "," +
+ * r.nextDouble() + "," +
+ * (if (r.nextBoolean()) 1 else 0))
+ * )
+ * pw.close()
+ * }
+ */
@Deprecated
public class OptIgSplit extends IgSplit {

  // Upper bound on candidate split points evaluated for a numeric attribute;
  // larger inputs are summarized by approximate percentiles (MAHOUT-1419).
  private static final int MAX_NUMERIC_SPLITS = 16;

  /**
   * Computes the best split of {@code data} on attribute {@code attr},
   * dispatching on whether the attribute is numerical or categorical.
   */
  @Override
  public Split computeSplit(Data data, int attr) {
    if (data.getDataset().isNumerical(attr)) {
      return numericalSplit(data, attr);
    } else {
      return categoricalSplit(data, attr);
    }
  }

  /**
   * Computes the split for a CATEGORICAL attribute
   *
   * <p>Returns the information gain IG = H(Y) - H(Y|X) of partitioning by the
   * attribute's distinct values; no threshold is attached to the Split.</p>
   */
  private static Split categoricalSplit(Data data, int attr) {
    double[] values = data.values(attr).clone();

    // One "split point" per distinct category value, in ascending order.
    double[] splitPoints = chooseCategoricalSplitPoints(values);

    int numLabels = data.getDataset().nblabels();
    int[][] counts = new int[splitPoints.length][numLabels];
    int[] countAll = new int[numLabels];

    computeFrequencies(data, attr, splitPoints, counts, countAll);

    int size = data.size();
    double hy = entropy(countAll, size); // H(Y)
    double hyx = 0.0; // H(Y|X)
    double invDataSize = 1.0 / size;

    // H(Y|X) = sum over buckets of P(bucket) * H(labels within bucket)
    for (int index = 0; index < splitPoints.length; index++) {
      size = DataUtils.sum(counts[index]);
      hyx += size * invDataSize * entropy(counts[index], size);
    }

    double ig = hy - hyx;
    return new Split(attr, ig);
  }

  /**
   * Tallies label frequencies per split bucket.
   *
   * @param splitPoints ascending candidate split values
   * @param counts      out-param: counts[s][label] = instances whose value falls in
   *                    bucket s (i.e. value <= splitPoints[s] but > splitPoints[s-1])
   * @param countAll    out-param: overall histogram of labels across all instances
   */
  static void computeFrequencies(Data data,
                                 int attr,
                                 double[] splitPoints,
                                 int[][] counts,
                                 int[] countAll) {
    Dataset dataset = data.getDataset();

    for (int index = 0; index < data.size(); index++) {
      Instance instance = data.get(index);
      int label = (int) dataset.getLabel(instance);
      double value = instance.get(attr);
      // Linear scan for the first split point >= value (splitPoints is small).
      int split = 0;
      while (split < splitPoints.length && value > splitPoints[split]) {
        split++;
      }
      if (split < splitPoints.length) {
        counts[split][label]++;
      } // Otherwise it's in the last split, which we don't need to count
      countAll[label]++;
    }
  }

  /**
   * Computes the best split for a NUMERICAL attribute
   *
   * <p>Scans candidate thresholds in ascending order, maintaining label counts
   * below the threshold ({@code countLess}) and at-or-above it ({@code countAll},
   * which is decremented in place), and keeps the threshold with the highest
   * information gain.</p>
   */
  static Split numericalSplit(Data data, int attr) {
    double[] values = data.values(attr).clone();
    Arrays.sort(values);

    double[] splitPoints = chooseNumericSplitPoints(values);

    int numLabels = data.getDataset().nblabels();
    int[][] counts = new int[splitPoints.length][numLabels];
    int[] countAll = new int[numLabels];
    int[] countLess = new int[numLabels];

    computeFrequencies(data, attr, splitPoints, counts, countAll);

    int size = data.size();
    double hy = entropy(countAll, size);
    double invDataSize = 1.0 / size;

    int best = -1;
    double bestIg = -1.0;

    // try each possible split value
    for (int index = 0; index < splitPoints.length; index++) {
      double ig = hy;

      // Move bucket 'index' from the high side to the low side.
      DataUtils.add(countLess, counts[index]);
      DataUtils.dec(countAll, counts[index]);

      // instance with attribute value < values[index]
      size = DataUtils.sum(countLess);
      ig -= size * invDataSize * entropy(countLess, size);
      // instance with attribute value >= values[index]
      size = DataUtils.sum(countAll);
      ig -= size * invDataSize * entropy(countAll, size);

      if (ig > bestIg) {
        bestIg = ig;
        best = index;
      }
    }

    if (best == -1) {
      throw new IllegalStateException("no best split found !");
    }
    return new Split(attr, bestIg, splitPoints[best]);
  }

  /**
   * @return an array of values to split the numeric feature's values on when
   *  building candidate splits. When input size is <= MAX_NUMERIC_SPLITS + 1, it will
   *  return the averages between successive values as split points. When larger, it will
   *  return MAX_NUMERIC_SPLITS approximate percentiles through the data.
   */
  private static double[] chooseNumericSplitPoints(double[] values) {
    if (values.length <= 1) {
      return values;
    }
    if (values.length <= MAX_NUMERIC_SPLITS + 1) {
      // Small input: midpoints between each pair of adjacent (sorted) values.
      double[] splitPoints = new double[values.length - 1];
      for (int i = 1; i < values.length; i++) {
        splitPoints[i-1] = (values[i] + values[i-1]) / 2.0;
      }
      return splitPoints;
    }
    // Large input: evenly spaced percentiles, e.g. 1/17..16/17 for 16 splits.
    Percentile distribution = new Percentile();
    distribution.setData(values);
    double[] percentiles = new double[MAX_NUMERIC_SPLITS];
    for (int i = 0 ; i < percentiles.length; i++) {
      double p = 100.0 * ((i + 1.0) / (MAX_NUMERIC_SPLITS + 1.0));
      percentiles[i] = distribution.evaluate(p);
    }
    return percentiles;
  }

  /**
   * @return the distinct category values in ascending order, used as one
   *         bucket boundary per category by computeFrequencies().
   */
  private static double[] chooseCategoricalSplitPoints(double[] values) {
    // There is no great reason to believe that categorical value order matters,
    // but the original code worked this way, and it's not terrible in the absence
    // of more sophisticated analysis
    Collection<Double> uniqueOrderedCategories = new TreeSet<>();
    for (double v : values) {
      uniqueOrderedCategories.add(v);
    }
    double[] uniqueValues = new double[uniqueOrderedCategories.size()];
    Iterator<Double> it = uniqueOrderedCategories.iterator();
    for (int i = 0; i < uniqueValues.length; i++) {
      uniqueValues[i] = it.next();
    }
    return uniqueValues;
  }

  /**
   * Computes the Entropy
   *
   * @param counts counts[i] = numInstances with label i
   * @param dataSize numInstances
   */
  private static double entropy(int[] counts, int dataSize) {
    if (dataSize == 0) {
      return 0.0;
    }

    double entropy = 0.0;

    for (int count : counts) {
      if (count > 0) {
        double p = count / (double) dataSize;
        entropy -= p * Math.log(p);
      }
    }

    // Divided by LOG2 (inherited from IgSplit) — presumably ln(2), converting
    // the natural-log entropy above into bits; confirm against IgSplit.
    return entropy / LOG2;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
new file mode 100644
index 0000000..38695a3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/RegressionSplit.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Instance;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * Regression problem implementation of IgSplit. This class can be used when the criterion variable is the numerical
+ * attribute.
+ */
@Deprecated
public class RegressionSplit extends IgSplit {

  /**
   * Comparator for Instance sort: orders instances ascending by the value of
   * the given attribute.
   */
  private static class InstanceComparator implements Comparator<Instance>, Serializable {
    private final int attr;

    InstanceComparator(int attr) {
      this.attr = attr;
    }

    @Override
    public int compare(Instance arg0, Instance arg1) {
      return Double.compare(arg0.get(attr), arg1.get(attr));
    }
  }

  /**
   * Computes the best split of the data on the given attribute, using variance
   * reduction of the (numerical) label as the quality criterion.
   */
  @Override
  public Split computeSplit(Data data, int attr) {
    if (data.getDataset().isNumerical(attr)) {
      return numericalSplit(data, attr);
    } else {
      return categoricalSplit(data, attr);
    }
  }

  /**
   * Computes the split for a CATEGORICAL attribute
   *
   * <p>Maintains for every category value a running mean and a running sum of
   * squared deviations (one-pass incremental variance update); the gain is the
   * total sum of squared deviations minus the per-category sums.</p>
   */
  private static Split categoricalSplit(Data data, int attr) {
    // ra[v] / sk[v]: running average and sum of squared deviations of the label
    // for instances whose attribute value is category v.
    FullRunningAverage[] ra = new FullRunningAverage[data.getDataset().nbValues(attr)];
    double[] sk = new double[data.getDataset().nbValues(attr)];
    for (int i = 0; i < ra.length; i++) {
      ra[i] = new FullRunningAverage();
    }
    FullRunningAverage totalRa = new FullRunningAverage();
    double totalSk = 0.0;

    for (int i = 0; i < data.size(); i++) {
      // computes the variance
      Instance instance = data.get(i);
      int value = (int) instance.get(attr);
      double xk = data.getDataset().getLabel(instance);
      if (ra[value].getCount() == 0) {
        ra[value].addDatum(xk);
        sk[value] = 0.0;
      } else {
        double mk = ra[value].getAverage();
        ra[value].addDatum(xk);
        // Incremental update: sk += (x - oldMean) * (x - newMean)
        sk[value] += (xk - mk) * (xk - ra[value].getAverage());
      }

      // total variance
      if (i == 0) {
        totalRa.addDatum(xk);
        totalSk = 0.0;
      } else {
        double mk = totalRa.getAverage();
        totalRa.addDatum(xk);
        totalSk += (xk - mk) * (xk - totalRa.getAverage());
      }
    }

    // computes the variance gain
    double ig = totalSk;
    for (double aSk : sk) {
      ig -= aSk;
    }

    return new Split(attr, ig);
  }

  /**
   * Computes the best split for a NUMERICAL attribute
   *
   * <p>Sorts instances by attribute value, starts with everything on the
   * "right" side (index 1), then walks instances across to the "left" side
   * (index 0) one at a time, incrementally updating each side's mean and sum
   * of squared deviations and evaluating a candidate threshold whenever the
   * attribute value increases.</p>
   */
  private static Split numericalSplit(Data data, int attr) {
    // ra[0]/sk[0]: left of the candidate split; ra[1]/sk[1]: right of it.
    FullRunningAverage[] ra = new FullRunningAverage[2];
    for (int i = 0; i < ra.length; i++) {
      ra[i] = new FullRunningAverage();
    }

    // Instance sort
    Instance[] instances = new Instance[data.size()];
    for (int i = 0; i < data.size(); i++) {
      instances[i] = data.get(i);
    }
    Arrays.sort(instances, new InstanceComparator(attr));

    // Seed the right side with every instance's label.
    double[] sk = new double[2];
    for (Instance instance : instances) {
      double xk = data.getDataset().getLabel(instance);
      if (ra[1].getCount() == 0) {
        ra[1].addDatum(xk);
        sk[1] = 0.0;
      } else {
        double mk = ra[1].getAverage();
        ra[1].addDatum(xk);
        sk[1] += (xk - mk) * (xk - ra[1].getAverage());
      }
    }
    double totalSk = sk[1];

    // find the best split point
    double split = Double.NaN;
    double preSplit = Double.NaN;
    double bestVal = Double.MAX_VALUE;
    double bestSk = 0.0;

    // computes total variance
    for (Instance instance : instances) {
      double xk = data.getDataset().getLabel(instance);

      // NaN preSplit makes this comparison false on the first iteration, so a
      // candidate is only evaluated once both sides are non-empty and the
      // attribute value has strictly increased.
      if (instance.get(attr) > preSplit) {
        double curVal = sk[0] / ra[0].getCount() + sk[1] / ra[1].getCount();
        if (curVal < bestVal) {
          bestVal = curVal;
          bestSk = sk[0] + sk[1];
          // Threshold is the midpoint between adjacent distinct values.
          split = (instance.get(attr) + preSplit) / 2.0;
        }
      }

      // computes the variance
      if (ra[0].getCount() == 0) {
        ra[0].addDatum(xk);
        sk[0] = 0.0;
      } else {
        double mk = ra[0].getAverage();
        ra[0].addDatum(xk);
        sk[0] += (xk - mk) * (xk - ra[0].getAverage());
      }

      // Remove the instance from the right side: inverse incremental update,
      // with the mean captured before removal.
      double mk = ra[1].getAverage();
      ra[1].removeDatum(xk);
      sk[1] -= (xk - mk) * (xk - ra[1].getAverage());

      preSplit = instance.get(attr);
    }

    // computes the variance gain
    double ig = totalSk - bestSk;

    return new Split(attr, ig, split);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
new file mode 100644
index 0000000..2a6a322
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/split/Split.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.split;
+
+import java.util.Locale;
+
/**
 * Immutable description of a candidate split: the attribute it applies to,
 * the information gain it achieves, and — for numerical attributes only —
 * the threshold value (NaN for categorical splits).
 */
@Deprecated
public final class Split {

  private final int attr;
  private final double ig;
  private final double split;

  /**
   * Creates a split on a numerical attribute.
   *
   * @param attr  index of the attribute being split on
   * @param ig    information gain achieved by the split
   * @param split threshold value separating the two branches
   */
  public Split(int attr, double ig, double split) {
    this.attr = attr;
    this.ig = ig;
    this.split = split;
  }

  /**
   * Creates a split on a categorical attribute; no threshold applies, so the
   * split value is recorded as NaN.
   */
  public Split(int attr, double ig) {
    this(attr, ig, Double.NaN);
  }

  /** @return attribute to split for */
  public int getAttr() {
    return attr;
  }

  /** @return Information Gain of the split */
  public double getIg() {
    return ig;
  }

  /** @return split value for NUMERICAL attributes */
  public double getSplit() {
    return split;
  }

  /** Human-readable summary; locale pinned so the decimal format is stable. */
  @Override
  public String toString() {
    return String.format(Locale.ENGLISH, "attr: %d, ig: %f, split: %f", attr, ig, split);
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
new file mode 100644
index 0000000..f29faed
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Describe.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.DataLoader;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.DescriptorException;
+import org.apache.mahout.classifier.df.data.DescriptorUtils;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generates a file descriptor for a given dataset
+ */
+public final class Describe implements Tool {
+
+ private static final Logger log = LoggerFactory.getLogger(Describe.class);
+
+ private Describe() {}
+
+ public static int main(String[] args) throws Exception {
+ return ToolRunner.run(new Describe(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option pathOpt = obuilder.withLongName("path").withShortName("p").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+ Option descriptorOpt = obuilder.withLongName("descriptor").withShortName("d").withRequired(true)
+ .withArgument(abuilder.withName("descriptor").withMinimum(1).create()).withDescription(
+ "data descriptor").create();
+
+ Option descPathOpt = obuilder.withLongName("file").withShortName("f").withRequired(true).withArgument(
+ abuilder.withName("file").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Path to generated descriptor file").create();
+
+ Option regOpt = obuilder.withLongName("regression").withDescription("Regression Problem").withShortName("r")
+ .create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(pathOpt).withOption(descPathOpt).withOption(
+ descriptorOpt).withOption(regOpt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return -1;
+ }
+
+ String dataPath = cmdLine.getValue(pathOpt).toString();
+ String descPath = cmdLine.getValue(descPathOpt).toString();
+ List<String> descriptor = convert(cmdLine.getValues(descriptorOpt));
+ boolean regression = cmdLine.hasOption(regOpt);
+
+ log.debug("Data path : {}", dataPath);
+ log.debug("Descriptor path : {}", descPath);
+ log.debug("Descriptor : {}", descriptor);
+ log.debug("Regression : {}", regression);
+
+ runTool(dataPath, descriptor, descPath, regression);
+ } catch (OptionException e) {
+ log.warn(e.toString());
+ CommandLineUtil.printHelp(group);
+ }
+ return 0;
+ }
+
+ private void runTool(String dataPath, Iterable<String> description, String filePath, boolean regression)
+ throws DescriptorException, IOException {
+ log.info("Generating the descriptor...");
+ String descriptor = DescriptorUtils.generateDescriptor(description);
+
+ Path fPath = validateOutput(filePath);
+
+ log.info("generating the dataset...");
+ Dataset dataset = generateDataset(descriptor, dataPath, regression);
+
+ log.info("storing the dataset description");
+ String json = dataset.toJSON();
+ DFUtils.storeString(conf, fPath, json);
+ }
+
+ private Dataset generateDataset(String descriptor, String dataPath, boolean regression) throws IOException,
+ DescriptorException {
+ Path path = new Path(dataPath);
+ FileSystem fs = path.getFileSystem(conf);
+
+ return DataLoader.generateDataset(descriptor, regression, fs, path);
+ }
+
+ private Path validateOutput(String filePath) throws IOException {
+ Path path = new Path(filePath);
+ FileSystem fs = path.getFileSystem(conf);
+ if (fs.exists(path)) {
+ throw new IllegalStateException("Descriptor's file already exists");
+ }
+
+ return path;
+ }
+
+ private static List<String> convert(Collection<?> values) {
+ List<String> list = new ArrayList<>(values.size());
+ for (Object value : values) {
+ list.add(value.toString());
+ }
+ return list;
+ }
+
+ private Configuration conf;
+
+ @Override
+ public void setConf(Configuration entries) {
+ this.conf = entries;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
new file mode 100644
index 0000000..b421c4e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/ForestVisualizer.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.DecisionForest;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This tool is to visualize the Decision Forest
+ */
+@Deprecated
+public final class ForestVisualizer {
+
+ private static final Logger log = LoggerFactory.getLogger(ForestVisualizer.class);
+
+ private ForestVisualizer() {
+ }
+
+ public static String toString(DecisionForest forest, Dataset dataset, String[] attrNames) {
+
+ List<Node> trees;
+ try {
+ Method getTrees = forest.getClass().getDeclaredMethod("getTrees");
+ getTrees.setAccessible(true);
+ trees = (List<Node>) getTrees.invoke(forest);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InvocationTargetException e) {
+ throw new IllegalStateException(e);
+ } catch (NoSuchMethodException e) {
+ throw new IllegalStateException(e);
+ }
+
+ int cnt = 1;
+ StringBuilder buff = new StringBuilder();
+ for (Node tree : trees) {
+ buff.append("Tree[").append(cnt).append("]:");
+ buff.append(TreeVisualizer.toString(tree, dataset, attrNames));
+ buff.append('\n');
+ cnt++;
+ }
+ return buff.toString();
+ }
+
+ /**
+ * Decision Forest to String
+ * @param forestPath
+ * path to the Decision Forest
+ * @param datasetPath
+ * dataset path
+ * @param attrNames
+ * attribute names
+ */
+ public static String toString(String forestPath, String datasetPath, String[] attrNames) throws IOException {
+ Configuration conf = new Configuration();
+ DecisionForest forest = DecisionForest.load(conf, new Path(forestPath));
+ Dataset dataset = Dataset.load(conf, new Path(datasetPath));
+ return toString(forest, dataset, attrNames);
+ }
+
+ /**
+ * Print Decision Forest
+ * @param forestPath
+ * path to the Decision Forest
+ * @param datasetPath
+ * dataset path
+ * @param attrNames
+ * attribute names
+ */
+ public static void print(String forestPath, String datasetPath, String[] attrNames) throws IOException {
+ System.out.println(toString(forestPath, datasetPath, attrNames));
+ }
+
+ public static void main(String[] args) {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
+ .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
+ .withDescription("Dataset path").create();
+
+ Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true)
+ .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
+ .withDescription("Path to the Decision Forest").create();
+
+ Option attrNamesOpt = obuilder.withLongName("names").withShortName("n").withRequired(false)
+ .withArgument(abuilder.withName("names").withMinimum(1).create())
+ .withDescription("Optional, Attribute names").create();
+
+ Option helpOpt = obuilder.withLongName("help").withShortName("h")
+ .withDescription("Print out help").create();
+
+ Group group = gbuilder.withName("Options").withOption(datasetOpt).withOption(modelOpt)
+ .withOption(attrNamesOpt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption("help")) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String datasetName = cmdLine.getValue(datasetOpt).toString();
+ String modelName = cmdLine.getValue(modelOpt).toString();
+ String[] attrNames = null;
+ if (cmdLine.hasOption(attrNamesOpt)) {
+ Collection<String> names = (Collection<String>) cmdLine.getValues(attrNamesOpt);
+ if (!names.isEmpty()) {
+ attrNames = new String[names.size()];
+ names.toArray(attrNames);
+ }
+ }
+
+ print(modelName, datasetName, attrNames);
+ } catch (Exception e) {
+ log.error("Exception", e);
+ CommandLineUtil.printHelp(group);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
new file mode 100644
index 0000000..c37af4e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/Frequencies.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.CommandLineUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Command-line entry point that computes the frequency distribution of the "class label"
+ * of a dataset by delegating to {@link FrequenciesJob} and logging the resulting
+ * per-partition counts.<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public final class Frequencies extends Configured implements Tool {
+
+ private static final Logger log = LoggerFactory.getLogger(Frequencies.class);
+
+ // Only instantiated from main(); not meant for public construction.
+ private Frequencies() { }
+
+ /**
+ * Parses the command line and launches the frequency computation.
+ * Recognized options: -d/--data (required), -ds/--dataset (required), -h/--help.
+ *
+ * @return always 0; option-parsing failures are logged and help is printed
+ * rather than returning a non-zero status
+ */
+ @Override
+ public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
+
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+ // NOTE(review): unlike dataOpt, this argument has no withMaximum(1), so it can
+ // accept multiple values even though only the first is used below — verify intent.
+ Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+ abuilder.withName("path").withMinimum(1).create()).withDescription("dataset path").create();
+
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(helpOpt)
+ .create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return 0;
+ }
+
+ String dataPath = cmdLine.getValue(dataOpt).toString();
+ String datasetPath = cmdLine.getValue(datasetOpt).toString();
+
+ log.debug("Data path : {}", dataPath);
+ log.debug("Dataset path : {}", datasetPath);
+
+ runTool(dataPath, datasetPath);
+ } catch (OptionException e) {
+ // Bad command line: log the problem and show usage instead of failing hard.
+ log.warn(e.toString(), e);
+ CommandLineUtil.printHelp(group);
+ }
+
+ return 0;
+ }
+
+ /**
+ * Runs {@link FrequenciesJob} against the given data/dataset paths and logs the
+ * resulting counts[partition][class] matrix, one row per partition.
+ *
+ * @param data path to the input data
+ * @param dataset path to the serialized dataset descriptor
+ */
+ private void runTool(String data, String dataset) throws IOException,
+ ClassNotFoundException,
+ InterruptedException {
+
+ FileSystem fs = FileSystem.get(getConf());
+ Path workingDir = fs.getWorkingDirectory();
+
+ Path dataPath = new Path(data);
+ Path datasetPath = new Path(dataset);
+
+ log.info("Computing the frequencies...");
+ // The job writes its intermediate output under <workingDir>/output.
+ FrequenciesJob job = new FrequenciesJob(new Path(workingDir, "output"), dataPath, datasetPath);
+
+ int[][] counts = job.run(getConf());
+
+ // outputting the frequencies
+ log.info("counts[partition][class]");
+ for (int[] count : counts) {
+ log.info(Arrays.toString(count));
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new Frequencies(), args);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
new file mode 100644
index 0000000..9d7e2ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/FrequenciesJob.java
@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.classifier.df.DFUtils;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.mapreduce.Builder;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+
+/**
+ * Temporary class used to compute the frequency distribution of the "class attribute".<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ * Runs a MapReduce job whose mapper tags every tuple of a partition with the first
+ * key seen in that partition, and whose reducer counts labels per partition.
+ */
+@Deprecated
+public class FrequenciesJob {
+
+ private static final Logger log = LoggerFactory.getLogger(FrequenciesJob.class);
+
+ /** directory that will hold this job's output */
+ private final Path outputPath;
+
+ /** file that contains the serialized dataset */
+ private final Path datasetPath;
+
+ /** directory that contains the data used in the first step */
+ private final Path dataPath;
+
+ /**
+ * @param base
+ * base directory
+ * @param dataPath
+ * data used in the first step
+ * @param datasetPath
+ * path of the serialized dataset descriptor
+ */
+ public FrequenciesJob(Path base, Path dataPath, Path datasetPath) {
+ this.outputPath = new Path(base, "frequencies.output");
+ this.dataPath = dataPath;
+ this.datasetPath = datasetPath;
+ }
+
+ /**
+ * Configures, submits and waits for the MapReduce job, then parses its output.
+ * The job output directory is deleted once the counts have been extracted.
+ *
+ * @return counts[partition][label] = num tuples from 'partition' with class == label
+ * @throws IOException if the output path already exists or the filesystem fails
+ * @throws IllegalStateException if the MapReduce job does not complete successfully
+ */
+ public int[][] run(Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
+
+ // check the output
+ FileSystem fs = outputPath.getFileSystem(conf);
+ if (fs.exists(outputPath)) {
+ throw new IOException("Output path already exists : " + outputPath);
+ }
+
+ // put the dataset into the DistributedCache so each task can load it locally
+ URI[] files = {datasetPath.toUri()};
+ DistributedCache.setCacheFiles(files, conf);
+
+ // NOTE(review): new Job(conf) is a deprecated constructor in later Hadoop
+ // releases (Job.getInstance(conf) is the replacement) — kept as-is here.
+ Job job = new Job(conf);
+ job.setJarByClass(FrequenciesJob.class);
+
+ FileInputFormat.setInputPaths(job, dataPath);
+ FileOutputFormat.setOutputPath(job, outputPath);
+
+ job.setMapOutputKeyClass(LongWritable.class);
+ job.setMapOutputValueClass(IntWritable.class);
+ job.setOutputKeyClass(LongWritable.class);
+ job.setOutputValueClass(Frequencies.class);
+
+ job.setMapperClass(FrequenciesMapper.class);
+ job.setReducerClass(FrequenciesReducer.class);
+
+ job.setInputFormatClass(TextInputFormat.class);
+ job.setOutputFormatClass(SequenceFileOutputFormat.class);
+
+ // run the job
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+
+ int[][] counts = parseOutput(job);
+
+ // the raw job output is only an intermediate artifact; remove it
+ HadoopUtil.delete(conf, outputPath);
+
+ return counts;
+ }
+
+ /**
+ * Extracts the output and processes it
+ *
+ * @return counts[partition][label] = num tuples from 'partition' with class == label
+ * @throws IllegalStateException if fewer Frequencies records were emitted than mappers
+ */
+ int[][] parseOutput(JobContext job) throws IOException {
+ Configuration conf = job.getConfiguration();
+
+ // NOTE(review): "mapred.map.tasks" is a legacy property name; if it is unset
+ // numMaps is -1 and the array allocation below fails — presumably the
+ // framework always sets it for this job. Verify on modern Hadoop.
+ int numMaps = conf.getInt("mapred.map.tasks", -1);
+ log.info("mapred.map.tasks = {}", numMaps);
+
+ FileSystem fs = outputPath.getFileSystem(conf);
+
+ Path[] outfiles = DFUtils.listOutputFiles(fs, outputPath);
+
+ Frequencies[] values = new Frequencies[numMaps];
+
+ // read all the outputs
+ int index = 0;
+ for (Path path : outfiles) {
+ for (Frequencies value : new SequenceFileValueIterable<Frequencies>(path, conf)) {
+ values[index++] = value;
+ }
+ }
+
+ if (index < numMaps) {
+ throw new IllegalStateException("number of output Frequencies (" + index
+ + ") is lesser than the number of mappers!");
+ }
+
+ // sort the frequencies using the firstIds
+ Arrays.sort(values);
+ return Frequencies.extractCounts(values);
+ }
+
+ /**
+ * Outputs the first key and the label of each tuple
+ *
+ * The first key (byte offset) seen by this mapper identifies its partition:
+ * every tuple of the split is emitted under that same key.
+ */
+ private static class FrequenciesMapper extends Mapper<LongWritable,Text,LongWritable,IntWritable> {
+
+ // lazily set to the first input key seen; used as the partition id
+ private LongWritable firstId;
+
+ private DataConverter converter;
+ private Dataset dataset;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ Configuration conf = context.getConfiguration();
+
+ dataset = Builder.loadDataset(conf);
+ setup(dataset);
+ }
+
+ /**
+ * Useful when testing
+ */
+ void setup(Dataset dataset) {
+ converter = new DataConverter(dataset);
+ }
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException,
+ InterruptedException {
+ if (firstId == null) {
+ // copy the key: Hadoop reuses the Writable instance across calls
+ firstId = new LongWritable(key.get());
+ }
+
+ Instance instance = converter.convert(value.toString());
+
+ context.write(firstId, new IntWritable((int) dataset.getLabel(instance)));
+ }
+
+ }
+
+ /**
+ * Counts, for one partition (reduce key), how many tuples carry each label and
+ * emits the tally as a single {@link Frequencies} record.
+ */
+ private static class FrequenciesReducer extends Reducer<LongWritable,IntWritable,LongWritable,Frequencies> {
+
+ // number of distinct labels in the dataset; sizes the counts array
+ private int nblabels;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ Configuration conf = context.getConfiguration();
+ Dataset dataset = Builder.loadDataset(conf);
+ setup(dataset.nblabels());
+ }
+
+ /**
+ * Useful when testing
+ */
+ void setup(int nblabels) {
+ this.nblabels = nblabels;
+ }
+
+ @Override
+ protected void reduce(LongWritable key, Iterable<IntWritable> values, Context context)
+ throws IOException, InterruptedException {
+ int[] counts = new int[nblabels];
+ for (IntWritable value : values) {
+ counts[value.get()]++;
+ }
+
+ context.write(key, new Frequencies(key.get(), counts));
+ }
+ }
+
+ /**
+ * Output of the job
+ *
+ * Holds the per-label counts of one partition, ordered among partitions by
+ * the partition's first key.
+ */
+ private static class Frequencies implements Writable, Comparable<Frequencies>, Cloneable {
+
+ /** first key of the partition used to sort the partitions */
+ private long firstId;
+
+ /** counts[c] = num tuples from the partition with label == c */
+ private int[] counts;
+
+ // no-arg constructor required by the Writable deserialization contract
+ Frequencies() { }
+
+ Frequencies(long firstId, int[] counts) {
+ this.firstId = firstId;
+ // defensive copy so callers cannot mutate our counts afterwards
+ this.counts = Arrays.copyOf(counts, counts.length);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ firstId = in.readLong();
+ counts = DFUtils.readIntArray(in);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeLong(firstId);
+ DFUtils.writeArray(out, counts);
+ }
+
+ // NOTE(review): equality/hashing deliberately ignore 'counts' and use only
+ // firstId, consistent with compareTo below.
+ @Override
+ public boolean equals(Object other) {
+ return other instanceof Frequencies && firstId == ((Frequencies) other).firstId;
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) firstId;
+ }
+
+ // NOTE(review): copy constructor idiom instead of super.clone(); the copy
+ // shares nothing with the original because the constructor copies 'counts'.
+ @Override
+ protected Frequencies clone() {
+ return new Frequencies(firstId, counts);
+ }
+
+ @Override
+ public int compareTo(Frequencies obj) {
+ if (firstId < obj.firstId) {
+ return -1;
+ } else if (firstId > obj.firstId) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ /**
+ * Collects the counts arrays of all partitions into one matrix.
+ *
+ * @return counts[partition][label]; rows are the (not copied) internal arrays
+ */
+ public static int[][] extractCounts(Frequencies[] partitions) {
+ int[][] counts = new int[partitions.length][];
+ for (int p = 0; p < partitions.length; p++) {
+ counts[p] = partitions[p].counts;
+ }
+ return counts;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
new file mode 100644
index 0000000..a2a3458
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/TreeVisualizer.java
@@ -0,0 +1,264 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.lang.reflect.Field;
+import java.text.DecimalFormat;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+
+/**
+ * This tool is to visualize the Decision tree
+ *
+ * Renders a decision tree (or the prediction trace of individual instances) as
+ * indented text. Private fields of the node classes are read via reflection,
+ * see {@link #getReflectMap()}.
+ */
+@Deprecated
+public final class TreeVisualizer {
+
+ // utility class: no instances
+ private TreeVisualizer() {}
+
+ /**
+ * Formats a double with at most two fractional digits.
+ * NOTE(review): DecimalFormat uses the JVM default locale, so the decimal
+ * separator may vary between machines — confirm this is acceptable for output.
+ */
+ private static String doubleToString(double value) {
+ DecimalFormat df = new DecimalFormat("0.##");
+ return df.format(value);
+ }
+
+ /**
+ * Recursively renders one node (and its subtree) as indented text.
+ *
+ * @param node current node
+ * @param dataset dataset descriptor, used for attribute/label values
+ * @param attrNames attribute names, or null to print attribute indices
+ * @param fields reflection map from {@link #getReflectMap()}
+ * @param layer current depth, controls the "| " indentation prefix
+ */
+ private static String toStringNode(Node node, Dataset dataset,
+ String[] attrNames, Map<String,Field> fields, int layer) {
+
+ StringBuilder buff = new StringBuilder();
+
+ try {
+ if (node instanceof CategoricalNode) {
+ CategoricalNode cnode = (CategoricalNode) node;
+ int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+ double[] values = (double[]) fields.get("CategoricalNode.values").get(cnode);
+ Node[] childs = (Node[]) fields.get("CategoricalNode.childs").get(cnode);
+ String[][] attrValues = (String[][]) fields.get("Dataset.values").get(dataset);
+ for (int i = 0; i < attrValues[attr].length; i++) {
+ // only render attribute values that actually have a child branch
+ int index = ArrayUtils.indexOf(values, i);
+ if (index < 0) {
+ continue;
+ }
+ buff.append('\n');
+ for (int j = 0; j < layer; j++) {
+ buff.append("| ");
+ }
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+ .append(attrValues[attr][i]);
+ buff.append(toStringNode(childs[index], dataset, attrNames, fields, layer + 1));
+ }
+ } else if (node instanceof NumericalNode) {
+ NumericalNode nnode = (NumericalNode) node;
+ int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+ double split = (Double) fields.get("NumericalNode.split").get(nnode);
+ Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+ Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
+ // numerical split renders two branches: "< split" then ">= split"
+ buff.append('\n');
+ for (int j = 0; j < layer; j++) {
+ buff.append("| ");
+ }
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" < ")
+ .append(doubleToString(split));
+ buff.append(toStringNode(loChild, dataset, attrNames, fields, layer + 1));
+ buff.append('\n');
+ for (int j = 0; j < layer; j++) {
+ buff.append("| ");
+ }
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" >= ")
+ .append(doubleToString(split));
+ buff.append(toStringNode(hiChild, dataset, attrNames, fields, layer + 1));
+ } else if (node instanceof Leaf) {
+ Leaf leaf = (Leaf) node;
+ double label = (Double) fields.get("Leaf.label").get(leaf);
+ if (dataset.isNumerical(dataset.getLabelId())) {
+ buff.append(" : ").append(doubleToString(label));
+ } else {
+ buff.append(" : ").append(dataset.getLabelString(label));
+ }
+ }
+ } catch (IllegalAccessException iae) {
+ throw new IllegalStateException(iae);
+ }
+
+ return buff.toString();
+ }
+
+ /**
+ * Builds an accessible-Field lookup for the private members read by this tool.
+ * NOTE(review): this couples the visualizer to the exact field names of
+ * CategoricalNode/NumericalNode/Leaf/Dataset; renaming any of them breaks it
+ * at runtime with an IllegalStateException.
+ */
+ private static Map<String,Field> getReflectMap() {
+ Map<String,Field> fields = new HashMap<>();
+
+ try {
+ Field m = CategoricalNode.class.getDeclaredField("attr");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.attr", m);
+ m = CategoricalNode.class.getDeclaredField("values");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.values", m);
+ m = CategoricalNode.class.getDeclaredField("childs");
+ m.setAccessible(true);
+ fields.put("CategoricalNode.childs", m);
+ m = NumericalNode.class.getDeclaredField("attr");
+ m.setAccessible(true);
+ fields.put("NumericalNode.attr", m);
+ m = NumericalNode.class.getDeclaredField("split");
+ m.setAccessible(true);
+ fields.put("NumericalNode.split", m);
+ m = NumericalNode.class.getDeclaredField("loChild");
+ m.setAccessible(true);
+ fields.put("NumericalNode.loChild", m);
+ m = NumericalNode.class.getDeclaredField("hiChild");
+ m.setAccessible(true);
+ fields.put("NumericalNode.hiChild", m);
+ m = Leaf.class.getDeclaredField("label");
+ m.setAccessible(true);
+ fields.put("Leaf.label", m);
+ m = Dataset.class.getDeclaredField("values");
+ m.setAccessible(true);
+ fields.put("Dataset.values", m);
+ } catch (NoSuchFieldException nsfe) {
+ throw new IllegalStateException(nsfe);
+ }
+
+ return fields;
+ }
+
+ /**
+ * Decision tree to String
+ *
+ * @param tree
+ * Node of tree
+ * @param dataset
+ * dataset descriptor used to resolve attribute/label values
+ * @param attrNames
+ * attribute names, or null to print attribute indices
+ */
+ public static String toString(Node tree, Dataset dataset, String[] attrNames) {
+ return toStringNode(tree, dataset, attrNames, getReflectMap(), 0);
+ }
+
+ /**
+ * Print Decision tree
+ *
+ * @param tree
+ * Node of tree
+ * @param dataset
+ * dataset descriptor used to resolve attribute/label values
+ * @param attrNames
+ * attribute names, or null to print attribute indices
+ */
+ public static void print(Node tree, Dataset dataset, String[] attrNames) {
+ System.out.println(toString(tree, dataset, attrNames));
+ }
+
+ /**
+ * Renders the decision path one instance takes through the tree, as a chain of
+ * " -> "-separated split decisions ending at the predicted label.
+ */
+ private static String toStringPredict(Node node, Instance instance,
+ Dataset dataset, String[] attrNames, Map<String,Field> fields) {
+ StringBuilder buff = new StringBuilder();
+
+ try {
+ if (node instanceof CategoricalNode) {
+ CategoricalNode cnode = (CategoricalNode) node;
+ int attr = (Integer) fields.get("CategoricalNode.attr").get(cnode);
+ double[] values = (double[]) fields.get("CategoricalNode.values").get(
+ cnode);
+ Node[] childs = (Node[]) fields.get("CategoricalNode.childs")
+ .get(cnode);
+ String[][] attrValues = (String[][]) fields.get("Dataset.values").get(
+ dataset);
+
+ // if the instance's value has no matching branch, the trace simply stops here
+ int index = ArrayUtils.indexOf(values, instance.get(attr));
+ if (index >= 0) {
+ buff.append(attrNames == null ? attr : attrNames[attr]).append(" = ")
+ .append(attrValues[attr][(int) instance.get(attr)]);
+ buff.append(" -> ");
+ buff.append(toStringPredict(childs[index], instance, dataset,
+ attrNames, fields));
+ }
+ } else if (node instanceof NumericalNode) {
+ NumericalNode nnode = (NumericalNode) node;
+ int attr = (Integer) fields.get("NumericalNode.attr").get(nnode);
+ double split = (Double) fields.get("NumericalNode.split").get(nnode);
+ Node loChild = (Node) fields.get("NumericalNode.loChild").get(nnode);
+ Node hiChild = (Node) fields.get("NumericalNode.hiChild").get(nnode);
+
+ if (instance.get(attr) < split) {
+ buff.append('(').append(attrNames == null ? attr : attrNames[attr])
+ .append(" = ").append(doubleToString(instance.get(attr)))
+ .append(") < ").append(doubleToString(split));
+ buff.append(" -> ");
+ buff.append(toStringPredict(loChild, instance, dataset, attrNames,
+ fields));
+ } else {
+ buff.append('(').append(attrNames == null ? attr : attrNames[attr])
+ .append(" = ").append(doubleToString(instance.get(attr)))
+ .append(") >= ").append(doubleToString(split));
+ buff.append(" -> ");
+ buff.append(toStringPredict(hiChild, instance, dataset, attrNames,
+ fields));
+ }
+ } else if (node instanceof Leaf) {
+ Leaf leaf = (Leaf) node;
+ double label = (Double) fields.get("Leaf.label").get(leaf);
+ if (dataset.isNumerical(dataset.getLabelId())) {
+ buff.append(doubleToString(label));
+ } else {
+ buff.append(dataset.getLabelString(label));
+ }
+ }
+ } catch (IllegalAccessException iae) {
+ throw new IllegalStateException(iae);
+ }
+
+ return buff.toString();
+ }
+
+ /**
+ * Predict trace to String
+ *
+ * @param tree
+ * Node of tree
+ * @param data
+ * instances to trace through the tree
+ * @param attrNames
+ * attribute names, or null to print attribute indices
+ * @return one trace string per instance, in data order
+ */
+ public static String[] predictTrace(Node tree, Data data, String[] attrNames) {
+ Map<String,Field> reflectMap = getReflectMap();
+ String[] prediction = new String[data.size()];
+ for (int i = 0; i < data.size(); i++) {
+ prediction[i] = toStringPredict(tree, data.get(i), data.getDataset(),
+ attrNames, reflectMap);
+ }
+ return prediction;
+ }
+
+ /**
+ * Print predict trace
+ *
+ * @param tree
+ * Node of tree
+ * @param data
+ * instances to trace through the tree
+ * @param attrNames
+ * attribute names, or null to print attribute indices
+ */
+ public static void predictTracePrint(Node tree, Data data, String[] attrNames) {
+ Map<String,Field> reflectMap = getReflectMap();
+ for (int i = 0; i < data.size(); i++) {
+ System.out.println(toStringPredict(tree, data.get(i), data.getDataset(),
+ attrNames, reflectMap));
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
new file mode 100644
index 0000000..e1b55ab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/tools/UDistrib.java
@@ -0,0 +1,212 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Locale;
+import java.util.Random;
+import java.util.Scanner;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.data.DataConverter;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This tool is used to uniformly distribute the class of all the tuples of the dataset over a given number of
+ * partitions.<br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public final class UDistrib {
+
+ private static final Logger log = LoggerFactory.getLogger(UDistrib.class);
+
+ // utility class: no instances
+ private UDistrib() {}
+
+ /**
+ * Launch the uniform distribution tool. Requires the following command line arguments:<br>
+ *
+ * data : data path dataset : dataset path numpartitions : num partitions output : output path
+ *
+ * @throws java.io.IOException
+ */
+ public static void main(String[] args) throws IOException {
+
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true).withArgument(
+ abuilder.withName("data").withMinimum(1).withMaximum(1).create()).withDescription("Data path").create();
+
+ // NOTE(review): no withMaximum(1) here, unlike dataOpt — presumably unintended; verify.
+ Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true).withArgument(
+ abuilder.withName("dataset").withMinimum(1).create()).withDescription("Dataset path").create();
+
+ Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
+ "Path to generated files").create();
+
+ // NOTE(review): withMinimum(1) is called twice below; the second call was
+ // almost certainly meant to be withMaximum(1) — confirm and fix upstream.
+ Option partitionsOpt = obuilder.withLongName("numpartitions").withShortName("p").withRequired(true)
+ .withArgument(abuilder.withName("numparts").withMinimum(1).withMinimum(1).create()).withDescription(
+ "Number of partitions to create").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
+ .create();
+
+ Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(outputOpt).withOption(
+ datasetOpt).withOption(partitionsOpt).withOption(helpOpt).create();
+
+ try {
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return;
+ }
+
+ String data = cmdLine.getValue(dataOpt).toString();
+ String dataset = cmdLine.getValue(datasetOpt).toString();
+ int numPartitions = Integer.parseInt(cmdLine.getValue(partitionsOpt).toString());
+ String output = cmdLine.getValue(outputOpt).toString();
+
+ runTool(data, dataset, output, numPartitions);
+ } catch (OptionException e) {
+ // bad command line: log and show usage instead of failing hard
+ log.warn(e.toString(), e);
+ CommandLineUtil.printHelp(group);
+ }
+
+ }
+
+ /**
+ * Distributes the tuples of the input data over numPartitions temporary part
+ * files, assigning each label's tuples to partitions round-robin (starting
+ * from a random partition per label), then merges the parts into the output.
+ *
+ * @param dataStr input data path
+ * @param datasetStr serialized dataset descriptor path
+ * @param output destination path (must not already exist)
+ * @param numPartitions number of partitions to create, must be > 0
+ */
+ private static void runTool(String dataStr, String datasetStr, String output, int numPartitions) throws IOException {
+
+ Preconditions.checkArgument(numPartitions > 0, "numPartitions <= 0");
+
+ // make sure the output file does not exist
+ Path outputPath = new Path(output);
+ Configuration conf = new Configuration();
+ FileSystem fs = outputPath.getFileSystem(conf);
+
+ Preconditions.checkArgument(!fs.exists(outputPath), "Output path already exists");
+
+ // create a new file corresponding to each partition
+ // Path workingDir = fs.getWorkingDirectory();
+ // FileSystem wfs = workingDir.getFileSystem(conf);
+ // File parentFile = new File(workingDir.toString());
+ // File tempFile = FileUtil.createLocalTempFile(parentFile, "Parts", true);
+ // File tempFile = File.createTempFile("df.tools.UDistrib","");
+ // tempFile.deleteOnExit();
+ // NOTE(review): new File("") resolves against the current working directory —
+ // confirm the temp parts are created where intended.
+ File tempFile = FileUtil.createLocalTempFile(new File(""), "df.tools.UDistrib", true);
+ Path partsPath = new Path(tempFile.toString());
+ FileSystem pfs = partsPath.getFileSystem(conf);
+
+ Path[] partPaths = new Path[numPartitions];
+ FSDataOutputStream[] files = new FSDataOutputStream[numPartitions];
+ for (int p = 0; p < numPartitions; p++) {
+ partPaths[p] = new Path(partsPath, String.format(Locale.ENGLISH, "part.%03d", p));
+ files[p] = pfs.create(partPaths[p]);
+ }
+
+ Path datasetPath = new Path(datasetStr);
+ Dataset dataset = Dataset.load(conf, datasetPath);
+
+ // currents[label] = next partition file where to place the tuple
+ int[] currents = new int[dataset.nblabels()];
+
+ // currents is initialized randomly in the range [0, numpartitions[
+ Random random = RandomUtils.getRandom();
+ for (int c = 0; c < currents.length; c++) {
+ currents[c] = random.nextInt(numPartitions);
+ }
+
+ // foreach tuple of the data
+ // NOTE(review): scanner/input/files are only closed on the success path below;
+ // an exception mid-loop leaks them — try-with-resources would be safer.
+ Path dataPath = new Path(dataStr);
+ FileSystem ifs = dataPath.getFileSystem(conf);
+ FSDataInputStream input = ifs.open(dataPath);
+ Scanner scanner = new Scanner(input, "UTF-8");
+ DataConverter converter = new DataConverter(dataset);
+
+ int id = 0;
+ while (scanner.hasNextLine()) {
+ if (id % 1000 == 0) {
+ log.info("progress : {}", id);
+ }
+
+ String line = scanner.nextLine();
+ if (line.isEmpty()) {
+ continue; // skip empty lines
+ }
+
+ // write the tuple in files[tuple.label]
+ Instance instance = converter.convert(line);
+ int label = (int) dataset.getLabel(instance);
+ files[currents[label]].writeBytes(line);
+ files[currents[label]].writeChar('\n');
+
+ // update currents (round-robin over the partitions, per label)
+ currents[label]++;
+ if (currents[label] == numPartitions) {
+ currents[label] = 0;
+ }
+ }
+
+ // close all the files.
+ scanner.close();
+ for (FSDataOutputStream file : files) {
+ Closeables.close(file, false);
+ }
+
+ // merge all output files
+ FileUtil.copyMerge(pfs, partsPath, fs, outputPath, true, conf, null);
+ /*
+ * FSDataOutputStream joined = fs.create(new Path(outputPath, "uniform.data")); for (int p = 0; p <
+ * numPartitions; p++) {log.info("Joining part : {}", p); FSDataInputStream partStream =
+ * fs.open(partPaths[p]);
+ *
+ * IOUtils.copyBytes(partStream, joined, conf, false);
+ *
+ * partStream.close(); }
+ *
+ * joined.close();
+ *
+ * fs.delete(partsPath, true);
+ */
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
new file mode 100644
index 0000000..049f9bf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/evaluation/Auc.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.evaluation;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.list.DoubleArrayList;
+
+import com.google.common.base.Preconditions;
+
+import java.util.Random;
+
+/**
+ * Computes AUC and a few other accuracy statistics without storing huge amounts of data. This is
+ * done by keeping uniform samples of the positive and negative scores. Then, when AUC is to be
+ * computed, the remaining scores are sorted and a rank-sum statistic is used to compute the AUC.
+ * Since AUC is invariant with respect to down-sampling of either positives or negatives, this is
+ * close to correct and is exactly correct if maxBufferSize or fewer positive and negative scores
+ * are examined.
+ */
+public class Auc {
+
+  // Cap on the number of positive and negative scores retained. AUC is exact when at
+  // most this many of each are seen, and a uniform-sample approximation beyond that.
+  private int maxBufferSize = 10000;
+  // scores[0]: sampled scores of true-negative examples; scores[1]: of true positives.
+  private final DoubleArrayList[] scores = {new DoubleArrayList(), new DoubleArrayList()};
+  private final Random rand;
+  // Total number of scored examples seen so far; drives the reservoir sampling in add().
+  private int samples;
+  // Decision threshold used when filling in the confusion matrix.
+  private final double threshold;
+  private final Matrix confusion;
+  // Running per-cell averages of log-likelihood terms; see entropy().
+  private final DenseMatrix entropy;
+
+  // True when scores are probabilities in [0,1]; enables the entropy bookkeeping.
+  private boolean probabilityScore = true;
+
+  // True once add(int, double) has been used; auc() requires score-based input.
+  private boolean hasScore;
+
+  /**
+   * Allocates a new data-structure for accumulating information about AUC and a few other accuracy
+   * measures.
+   * @param threshold The threshold to use in computing the confusion matrix.
+   */
+  public Auc(double threshold) {
+    confusion = new DenseMatrix(2, 2);
+    entropy = new DenseMatrix(2, 2);
+    this.rand = RandomUtils.getRandom();
+    this.threshold = threshold;
+  }
+
+  /** Creates an accumulator with the default decision threshold of 0.5. */
+  public Auc() {
+    this(0.5);
+  }
+
+  /**
+   * Adds a score to the AUC buffers.
+   *
+   * @param trueValue Whether this score is for a true-positive or a true-negative example.
+   * @param score The score for this example.
+   */
+  public void add(int trueValue, double score) {
+    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
+    hasScore = true;
+
+    int predictedClass = score > threshold ? 1 : 0;
+    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
+
+    samples++;
+    if (isProbabilityScore()) {
+      // clamp the score away from 0 and 1 so the logs below stay finite
+      double limited = Math.max(1.0e-20, Math.min(score, 1 - 1.0e-20));
+      // incremental running mean of log(1 - p) for this true class
+      double v0 = entropy.get(trueValue, 0);
+      entropy.set(trueValue, 0, (Math.log1p(-limited) - v0) / samples + v0);
+
+      // incremental running mean of log(p) for this true class
+      double v1 = entropy.get(trueValue, 1);
+      entropy.set(trueValue, 1, (Math.log(limited) - v1) / samples + v1);
+    }
+
+    // add to buffers
+    DoubleArrayList buf = scores[trueValue];
+    if (buf.size() >= maxBufferSize) {
+      // but if too many points are seen, we insert into a random
+      // place and discard the predecessor. The random place could
+      // be anywhere, possibly not even in the buffer.
+      // this is a special case of Knuth's permutation algorithm
+      // but since we don't ever shuffle the first maxBufferSize
+      // samples, the result isn't just a fair sample of the prefixes
+      // of all permutations. The CONTENTs of the result, however,
+      // will be a fair and uniform sample of maxBufferSize elements
+      // chosen from all elements without replacement
+      int index = rand.nextInt(samples);
+      if (index < buf.size()) {
+        buf.set(index, score);
+      }
+    } else {
+      // for small buffers, we collect all points without permuting
+      // since we sort the data later, permuting now would just be
+      // pedantic
+      buf.add(score);
+    }
+  }
+
+  /**
+   * Records a hard classification (no score). Using this disables score-based statistics
+   * such as {@link #auc()}.
+   *
+   * @param trueValue Whether this example is a true positive (1) or a true negative (0).
+   * @param predictedClass The class (0 or 1) assigned by the classifier.
+   */
+  public void add(int trueValue, int predictedClass) {
+    hasScore = false;
+    Preconditions.checkArgument(trueValue == 0 || trueValue == 1, "True value must be 0 or 1");
+    confusion.set(trueValue, predictedClass, confusion.get(trueValue, predictedClass) + 1);
+  }
+
+  /**
+   * Computes the AUC of points seen so far. This can be moderately expensive since it requires
+   * that all points that have been retained be sorted.
+   *
+   * @return The value of the Area Under the receiver operating Curve.
+   */
+  public double auc() {
+    Preconditions.checkArgument(hasScore, "Can't compute AUC for classifier without a score");
+    scores[0].sort();
+    scores[1].sort();
+
+    double n0 = scores[0].size();
+    double n1 = scores[1].size();
+
+    if (n0 == 0 || n1 == 0) {
+      // with no positives or no negatives AUC is undefined; 0.5 is the
+      // conventional "no information" value
+      return 0.5;
+    }
+
+    // scan the data, merging the two sorted buffers while accumulating the
+    // rank-sum of the positive scores (the Mann-Whitney U statistic)
+    int i0 = 0;
+    int i1 = 0;
+    int rank = 1;
+    double rankSum = 0;
+    while (i0 < n0 && i1 < n1) {
+
+      double v0 = scores[0].get(i0);
+      double v1 = scores[1].get(i1);
+
+      if (v0 < v1) {
+        i0++;
+        rank++;
+      } else if (v1 < v0) {
+        i1++;
+        rankSum += rank;
+        rank++;
+      } else {
+        // ties have to be handled delicately
+        double tieScore = v0;
+
+        // how many negatives are tied?
+        int k0 = 0;
+        while (i0 < n0 && scores[0].get(i0) == tieScore) {
+          k0++;
+          i0++;
+        }
+
+        // and how many positives
+        int k1 = 0;
+        while (i1 < n1 && scores[1].get(i1) == tieScore) {
+          k1++;
+          i1++;
+        }
+
+        // we found k0 + k1 tied values which have
+        // ranks in the half open interval [rank, rank + k0 + k1)
+        // the average rank is assigned to all
+        rankSum += (rank + (k0 + k1 - 1) / 2.0) * k1;
+        rank += k0 + k1;
+      }
+    }
+
+    // any positives left over out-rank everything already scanned; assign them
+    // the average of the remaining ranks
+    if (i1 < n1) {
+      rankSum += (rank + (n1 - i1 - 1) / 2.0) * (n1 - i1);
+      rank += (int) (n1 - i1);
+    }
+
+    // convert the rank-sum into the AUC estimate
+    return (rankSum / n1 - (n1 + 1) / 2) / n0;
+  }
+
+  /**
+   * Returns the confusion matrix for the classifier supposing that we were to use a particular
+   * threshold.
+   * @return The confusion matrix.
+   */
+  public Matrix confusion() {
+    return confusion;
+  }
+
+  /**
+   * Returns a matrix related to the confusion matrix and to the log-likelihood. For a
+   * pretty accurate classifier, N + entropy is nearly the same as the confusion matrix
+   * because log(1-eps) \approx -eps if eps is small.
+   *
+   * For lower accuracy classifiers, this measure will give us a better picture of how
+   * things work out.
+   *
+   * Also, by definition, log-likelihood = sum(diag(entropy))
+   * @return Returns a cell by cell break-down of the log-likelihood
+   */
+  public Matrix entropy() {
+    if (!hasScore) {
+      // find a constant score that would optimize log-likelihood, but use a dash of Bayesian
+      // conservatism to avoid dividing by zero or taking log(0)
+      double p = (0.5 + confusion.get(1, 1)) / (1 + confusion.get(0, 0) + confusion.get(1, 1));
+      entropy.set(0, 0, confusion.get(0, 0) * Math.log1p(-p));
+      entropy.set(0, 1, confusion.get(0, 1) * Math.log(p));
+      entropy.set(1, 0, confusion.get(1, 0) * Math.log1p(-p));
+      entropy.set(1, 1, confusion.get(1, 1) * Math.log(p));
+    }
+    return entropy;
+  }
+
+  /** Sets the maximum number of positive/negative scores retained for the AUC estimate. */
+  public void setMaxBufferSize(int maxBufferSize) {
+    this.maxBufferSize = maxBufferSize;
+  }
+
+  /** @return true when scores are treated as probabilities (enables entropy tracking) */
+  public boolean isProbabilityScore() {
+    return probabilityScore;
+  }
+
+  /** Declares whether scores passed to {@link #add(int, double)} are probabilities. */
+  public void setProbabilityScore(boolean probabilityScore) {
+    this.probabilityScore = probabilityScore;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
new file mode 100644
index 0000000..f0794b3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/AbstractNaiveBayesClassifier.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+
+/**
+ * Class implementing the Naive Bayes Classifier Algorithm. Note that this class
+ * supports {@link #classifyFull}, but not {@code classify} or
+ * {@code classifyScalar}. The reason that these two methods are not
+ * supported is because the scores computed by a NaiveBayesClassifier do not
+ * represent probabilities.
+ */
+public abstract class AbstractNaiveBayesClassifier extends AbstractVectorClassifier {
+
+  private final NaiveBayesModel model;
+
+  protected AbstractNaiveBayesClassifier(NaiveBayesModel model) {
+    this.model = model;
+  }
+
+  /** @return the underlying model this classifier scores against */
+  protected NaiveBayesModel getModel() {
+    return model;
+  }
+
+  /**
+   * Computes the (unnormalized) score contributed by a single feature toward a label.
+   *
+   * @param label index of the label being scored
+   * @param feature index of the feature
+   * @return the per-feature score; accumulated, weighted by the feature value, in
+   *         {@link #getScoreForLabelInstance(int, Vector)}
+   */
+  protected abstract double getScoreForLabelFeature(int label, int feature);
+
+  /**
+   * Scores one instance for one label by summing the weighted per-feature scores
+   * over the instance's non-zero entries.
+   */
+  protected double getScoreForLabelInstance(int label, Vector instance) {
+    double result = 0.0;
+    for (Element e : instance.nonZeroes()) {
+      result += e.get() * getScoreForLabelFeature(label, e.index());
+    }
+    return result;
+  }
+
+  @Override
+  public int numCategories() {
+    return model.numLabels();
+  }
+
+  @Override
+  public Vector classifyFull(Vector instance) {
+    return classifyFull(model.createScoringVector(), instance);
+  }
+
+  @Override
+  public Vector classifyFull(Vector r, Vector instance) {
+    // fill r with one (unnormalized) score per label
+    for (int label = 0; label < model.numLabels(); label++) {
+      r.setQuick(label, getScoreForLabelInstance(label, instance));
+    }
+    return r;
+  }
+
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
+  @Override
+  public double classifyScalar(Vector instance) {
+    throw new UnsupportedOperationException("Not supported in Naive Bayes");
+  }
+
+  /** Unsupported method. This implementation simply throws an {@link UnsupportedOperationException}. */
+  @Override
+  public Vector classify(Vector instance) {
+    throw new UnsupportedOperationException("Probabilities not supported in Naive Bayes");
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
new file mode 100644
index 0000000..4db8b17
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/BayesUtils.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.regex.Pattern;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.naivebayes.training.ThetaMapper;
+import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SparseMatrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+public final class BayesUtils {
+
+  private static final Pattern SLASH = Pattern.compile("/");
+
+  private BayesUtils() {}
+
+  /**
+   * Reconstructs a {@link NaiveBayesModel} from the output directory of a training run:
+   * per-feature/per-label weight sums, the summed observation matrix, and (for the
+   * complementary model only) the per-label theta normalizers.
+   */
+  public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) {
+
+    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);
+    boolean isComplementary = conf.getBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, true);
+
+    // read feature sums and label sums
+    Vector scoresPerLabel = null;
+    Vector scoresPerFeature = null;
+    for (Pair<Text,VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
+        new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) {
+      String key = record.getFirst().toString();
+      VectorWritable value = record.getSecond();
+      if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
+        scoresPerFeature = value.get();
+      } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
+        scoresPerLabel = value.get();
+      }
+    }
+
+    Preconditions.checkNotNull(scoresPerFeature);
+    Preconditions.checkNotNull(scoresPerLabel);
+
+    // one row of feature weights per label
+    Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
+    for (Pair<IntWritable,VectorWritable> entry : new SequenceFileDirIterable<IntWritable,VectorWritable>(
+        new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(), conf)) {
+      scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());
+    }
+
+    // perLabelThetaNormalizer is only used by the complementary model, we do not instantiate it for the standard model
+    Vector perLabelThetaNormalizer = null;
+    if (isComplementary) {
+      perLabelThetaNormalizer = scoresPerLabel.like();
+      for (Pair<Text,VectorWritable> entry : new SequenceFileDirIterable<Text,VectorWritable>(
+          new Path(base, TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(), conf)) {
+        if (entry.getFirst().toString().equals(TrainNaiveBayesJob.LABEL_THETA_NORMALIZER)) {
+          perLabelThetaNormalizer = entry.getSecond().get();
+        }
+      }
+      Preconditions.checkNotNull(perLabelThetaNormalizer);
+    }
+
+    return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel, perLabelThetaNormalizer,
+        alphaI, isComplementary);
+  }
+
+  /** Write the list of labels into a map file */
+  public static int writeLabelIndex(Configuration conf, Iterable<String> labels, Path indexPath)
+    throws IOException {
+    FileSystem fs = FileSystem.get(indexPath.toUri(), conf);
+    int i = 0;
+    try (SequenceFile.Writer writer =
+           SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(indexPath),
+             SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(IntWritable.class))) {
+      for (String label : labels) {
+        writer.append(new Text(label), new IntWritable(i++));
+      }
+    }
+    return i;
+  }
+
+  /**
+   * Writes the distinct labels into a map file. Label names are taken as the second
+   * slash-separated component of each pair's key (e.g. {@code /label/docId}).
+   *
+   * @return the number of distinct labels written
+   */
+  public static int writeLabelIndex(Configuration conf, Path indexPath,
+                                    Iterable<Pair<Text,IntWritable>> labels) throws IOException {
+    FileSystem fs = FileSystem.get(indexPath.toUri(), conf);
+    Collection<String> seen = new HashSet<>();
+    int i = 0;
+    try (SequenceFile.Writer writer =
+           SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(indexPath),
+             SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(IntWritable.class))) {
+      for (Pair<Text,IntWritable> label : labels) {
+        String theLabel = SLASH.split(label.getFirst().toString())[1];
+        if (!seen.contains(theLabel)) {
+          writer.append(new Text(theLabel), new IntWritable(i++));
+          seen.add(theLabel);
+        }
+      }
+    }
+    return i;
+  }
+
+  /** Reads a label index written by {@link #writeLabelIndex}, keyed by label id. */
+  public static Map<Integer, String> readLabelIndex(Configuration conf, Path indexPath) {
+    Map<Integer, String> labelMap = new HashMap<>();
+    for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(indexPath, true, conf)) {
+      labelMap.put(pair.getSecond().get(), pair.getFirst().toString());
+    }
+    return labelMap;
+  }
+
+  /** Reads a string-to-int index from the single file in the distributed cache. */
+  public static OpenObjectIntHashMap<String> readIndexFromCache(Configuration conf) throws IOException {
+    OpenObjectIntHashMap<String> index = new OpenObjectIntHashMap<>();
+    for (Pair<Writable,IntWritable> entry
+        : new SequenceFileIterable<Writable,IntWritable>(HadoopUtil.getSingleCachedFile(conf), conf)) {
+      index.put(entry.getFirst().toString(), entry.getSecond().get());
+    }
+    return index;
+  }
+
+  /** Reads named score vectors from the single file in the distributed cache. */
+  public static Map<String,Vector> readScoresFromCache(Configuration conf) throws IOException {
+    Map<String,Vector> sumVectors = new HashMap<>();
+    for (Pair<Text,VectorWritable> entry
+        : new SequenceFileDirIterable<Text,VectorWritable>(HadoopUtil.getSingleCachedFile(conf),
+          PathType.LIST, PathFilters.partFilter(), conf)) {
+      sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
+    }
+    return sumVectors;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
new file mode 100644
index 0000000..18bd3d6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/naivebayes/ComplementaryNaiveBayesClassifier.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.naivebayes;
+
+
+/** Implementation of the Naive Bayes Classifier Algorithm */
+public class ComplementaryNaiveBayesClassifier extends AbstractNaiveBayesClassifier {
+  public ComplementaryNaiveBayesClassifier(NaiveBayesModel model) {
+    super(model);
+  }
+
+  /**
+   * Scores a single feature for a label using the complementary weight, normalized per label.
+   * see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight Magnitude Errors
+   */
+  @Override
+  public double getScoreForLabelFeature(int label, int feature) {
+    NaiveBayesModel m = getModel();
+    return computeWeight(m.featureWeight(feature), m.weight(label, feature),
+        m.totalWeightSum(), m.labelWeight(label), m.alphaI(), m.numFeatures())
+        / m.thetaNormalizer(label);
+  }
+
+  /**
+   * Computes the smoothed complementary weight of a feature for a label.
+   * see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.1, Skewed Data bias
+   */
+  public static double computeWeight(double featureWeight, double featureLabelWeight,
+      double totalWeight, double labelWeight, double alphaI, double numFeatures) {
+    double num = featureWeight - featureLabelWeight + alphaI;
+    double den = totalWeight - labelWeight + alphaI * numFeatures;
+    return -Math.log(num / den);
+  }
+}
r***@apache.org
2018-06-28 14:54:46 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
new file mode 100644
index 0000000..20446f8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPlusPlusFactorizer.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.common.RandomUtils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * SVD++, an enhancement of classical matrix factorization for rating prediction.
+ * Additionally to using ratings (how did people rate?) for learning, this model also takes into account
+ * who rated what.
+ *
+ * Yehuda Koren: Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model, KDD 2008.
+ * http://research.yahoo.com/files/kdd08koren.pdf
+ */
+public final class SVDPlusPlusFactorizer extends RatingSGDFactorizer {
+
+  /** Explicit user factors p_u, learned separately from the implicit-feedback term. */
+  private double[][] p;
+  /** Implicit-feedback item factors y_i. */
+  private double[][] y;
+  /** Internal item indexes rated by each internal user index (N(u) in the paper). */
+  private Map<Integer, List<Integer>> itemsByUser;
+
+  public SVDPlusPlusFactorizer(DataModel dataModel, int numFeatures, int numIterations) throws TasteException {
+    this(dataModel, numFeatures, 0.01, 0.1, 0.01, numIterations, 1.0);
+    biasLearningRate = 0.7;
+    biasReg = 0.33;
+  }
+
+  public SVDPlusPlusFactorizer(DataModel dataModel, int numFeatures, double learningRate, double preventOverfitting,
+      double randomNoise, int numIterations, double learningRateDecay) throws TasteException {
+    super(dataModel, numFeatures, learningRate, preventOverfitting, randomNoise, numIterations, learningRateDecay);
+  }
+
+  @Override
+  protected void prepareTraining() throws TasteException {
+    super.prepareTraining();
+    Random random = RandomUtils.getRandom();
+
+    // bias slots (below FEATURE_OFFSET) start at zero; real features get small noise
+    p = new double[dataModel.getNumUsers()][numFeatures];
+    for (int i = 0; i < p.length; i++) {
+      for (int feature = 0; feature < FEATURE_OFFSET; feature++) {
+        p[i][feature] = 0;
+      }
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        p[i][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+
+    y = new double[dataModel.getNumItems()][numFeatures];
+    for (int i = 0; i < y.length; i++) {
+      for (int feature = 0; feature < FEATURE_OFFSET; feature++) {
+        y[i][feature] = 0;
+      }
+      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+        y[i][feature] = random.nextGaussian() * randomNoise;
+      }
+    }
+
+    /* get internal item IDs which we will need several times */
+    itemsByUser = new HashMap<>();
+    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+    while (userIDs.hasNext()) {
+      long userId = userIDs.nextLong();
+      int userIndex = userIndex(userId);
+      FastIDSet itemIDsFromUser = dataModel.getItemIDsFromUser(userId);
+      List<Integer> itemIndexes = new ArrayList<>(itemIDsFromUser.size());
+      itemsByUser.put(userIndex, itemIndexes);
+      for (long itemID2 : itemIDsFromUser) {
+        int i2 = itemIndex(itemID2);
+        itemIndexes.add(i2);
+      }
+    }
+  }
+
+  /**
+   * Runs SGD training, then folds the implicit-feedback item factors into each user
+   * vector: u = p_u + sum_{i in N(u)} y_i / sqrt(|N(u)|).
+   */
+  @Override
+  public Factorization factorize() throws TasteException {
+    prepareTraining();
+
+    super.factorize();
+
+    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
+      for (int itemIndex : itemsByUser.get(userIndex)) {
+        for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+          userVectors[userIndex][feature] += y[itemIndex][feature];
+        }
+      }
+      double denominator = Math.sqrt(itemsByUser.get(userIndex).size());
+      for (int feature = 0; feature < userVectors[userIndex].length; feature++) {
+        userVectors[userIndex][feature] =
+            (float) (userVectors[userIndex][feature] / denominator + p[userIndex][feature]);
+      }
+    }
+
+    return createFactorization(userVectors, itemVectors);
+  }
+
+
+  /**
+   * One SGD step for a single (user, item, rating) observation: updates biases,
+   * the user factors p_u, the item factors, and the implicit factors y_j for all
+   * items j rated by the user.
+   */
+  @Override
+  protected void updateParameters(long userID, long itemID, float rating, double currentLearningRate) {
+    int userIndex = userIndex(userID);
+    int itemIndex = itemIndex(itemID);
+
+    double[] userVector = p[userIndex];
+    double[] itemVector = itemVectors[itemIndex];
+
+    // effective user vector: p_u plus normalized sum of implicit item factors
+    double[] pPlusY = new double[numFeatures];
+    for (int i2 : itemsByUser.get(userIndex)) {
+      for (int f = FEATURE_OFFSET; f < numFeatures; f++) {
+        pPlusY[f] += y[i2][f];
+      }
+    }
+    double denominator = Math.sqrt(itemsByUser.get(userIndex).size());
+    for (int feature = 0; feature < pPlusY.length; feature++) {
+      pPlusY[feature] = (float) (pPlusY[feature] / denominator + p[userIndex][feature]);
+    }
+
+    double prediction = predictRating(pPlusY, itemIndex);
+    double err = rating - prediction;
+    double normalizedError = err / denominator;
+
+    // adjust user bias
+    userVector[USER_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * userVector[USER_BIAS_INDEX]);
+
+    // adjust item bias
+    itemVector[ITEM_BIAS_INDEX] +=
+        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * itemVector[ITEM_BIAS_INDEX]);
+
+    // adjust features
+    for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
+      double pF = userVector[feature];
+      double iF = itemVector[feature];
+
+      double deltaU = err * iF - preventOverfitting * pF;
+      userVector[feature] += currentLearningRate * deltaU;
+
+      double deltaI = err * pPlusY[feature] - preventOverfitting * iF;
+      itemVector[feature] += currentLearningRate * deltaI;
+
+      double commonUpdate = normalizedError * iF;
+      for (int itemIndex2 : itemsByUser.get(userIndex)) {
+        double deltaI2 = commonUpdate - preventOverfitting * y[itemIndex2][feature];
+        y[itemIndex2][feature] += learningRate * deltaI2;
+      }
+    }
+  }
+
+  /** Dot product of the given user vector with the factors of the item at this internal index. */
+  private double predictRating(double[] userVector, int itemID) {
+    double sum = 0;
+    for (int feature = 0; feature < numFeatures; feature++) {
+      sum += userVector[feature] * itemVectors[itemID][feature];
+    }
+    return sum;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
new file mode 100644
index 0000000..45c54da
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDPreference.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+
+final class SVDPreference extends GenericPreference {
+
+  // Per-preference scratch value maintained by the factorization algorithm.
+  private double cache;
+
+  SVDPreference(long userID, long itemID, float value, double cache) {
+    super(userID, itemID, value);
+    // route through the setter so the NaN check applies at construction time too
+    setCache(cache);
+  }
+
+  /** Stores a new cached value; NaN is rejected. */
+  public void setCache(double value) {
+    if (Double.isNaN(value)) {
+      throw new IllegalArgumentException("NaN cache value");
+    }
+    this.cache = value;
+  }
+
+  /** @return the cached value last set */
+  public double getCache() {
+    return cache;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
new file mode 100644
index 0000000..45d4af7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/SVDRecommender.java
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.recommender.AbstractRecommender;
+import org.apache.mahout.cf.taste.impl.recommender.AllUnknownItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * A {@link org.apache.mahout.cf.taste.recommender.Recommender} that uses matrix factorization (a projection of users
 * and items onto a feature space). Preferences are estimated as the dot product of a user's and an item's
 * feature vectors.
 */
public final class SVDRecommender extends AbstractRecommender {

  // Current model; assigned in the constructor (loaded or trained) and replaced by train() on refresh.
  private Factorization factorization;
  private final Factorizer factorizer;
  // Where factorizations are loaded from / persisted to; NoPersistenceStrategy by default.
  private final PersistenceStrategy persistenceStrategy;
  private final RefreshHelper refreshHelper;

  private static final Logger log = LoggerFactory.getLogger(SVDRecommender.class);

  /**
   * Creates an SVDRecommender with the default candidate-items strategy and no persistence
   * (the factorization is computed eagerly here and kept only in memory).
   *
   * @param dataModel the model containing user preferences
   * @param factorizer computes the matrix factorization
   * @throws TasteException if factorization fails
   */
  public SVDRecommender(DataModel dataModel, Factorizer factorizer) throws TasteException {
    this(dataModel, factorizer, new AllUnknownItemsCandidateItemsStrategy(), getDefaultPersistenceStrategy());
  }

  /**
   * Creates an SVDRecommender with the given candidate-items strategy and no persistence.
   *
   * @param dataModel the model containing user preferences
   * @param factorizer computes the matrix factorization
   * @param candidateItemsStrategy chooses which items are considered for recommendation
   * @throws TasteException if factorization fails
   */
  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy)
    throws TasteException {
    this(dataModel, factorizer, candidateItemsStrategy, getDefaultPersistenceStrategy());
  }

  /**
   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
   * store if present, otherwise a new factorization is computed and saved in the store.
   *
   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
   *
   * @param dataModel the model containing user preferences
   * @param factorizer computes the matrix factorization
   * @param persistenceStrategy loads and saves factorizations
   * @throws TasteException if loading, factorizing or persisting fails (I/O problems are wrapped)
   */
  public SVDRecommender(DataModel dataModel, Factorizer factorizer, PersistenceStrategy persistenceStrategy)
    throws TasteException {
    this(dataModel, factorizer, getDefaultCandidateItemsStrategy(), persistenceStrategy);
  }

  /**
   * Create an SVDRecommender using a persistent store to cache factorizations. A factorization is loaded from the
   * store if present, otherwise a new factorization is computed and saved in the store.
   *
   * The {@link #refresh(java.util.Collection) refresh} method recomputes the factorization and overwrites the store.
   *
   * @param dataModel the model containing user preferences
   * @param factorizer computes the matrix factorization
   * @param candidateItemsStrategy chooses which items are considered for recommendation
   * @param persistenceStrategy loads and saves factorizations
   *
   * @throws TasteException if loading, factorizing or persisting fails (I/O problems are wrapped)
   */
  public SVDRecommender(DataModel dataModel, Factorizer factorizer, CandidateItemsStrategy candidateItemsStrategy,
      PersistenceStrategy persistenceStrategy) throws TasteException {
    super(dataModel, candidateItemsStrategy);
    this.factorizer = Preconditions.checkNotNull(factorizer);
    this.persistenceStrategy = Preconditions.checkNotNull(persistenceStrategy);
    try {
      // Try to reuse a previously persisted factorization before training a new one.
      factorization = persistenceStrategy.load();
    } catch (IOException e) {
      throw new TasteException("Error loading factorization", e);
    }

    if (factorization == null) {
      train();
    }

    // On refresh, retrain (and re-persist) the factorization.
    refreshHelper = new RefreshHelper(new Callable<Object>() {
      @Override
      public Object call() throws TasteException {
        train();
        return null;
      }
    });
    refreshHelper.addDependency(getDataModel());
    refreshHelper.addDependency(factorizer);
    refreshHelper.addDependency(candidateItemsStrategy);
  }

  /** Default persistence: keep the factorization in memory only. */
  static PersistenceStrategy getDefaultPersistenceStrategy() {
    return new NoPersistenceStrategy();
  }

  /** Recomputes the factorization and asks the persistence strategy to save it. */
  private void train() throws TasteException {
    factorization = factorizer.factorize();
    try {
      persistenceStrategy.maybePersist(factorization);
    } catch (IOException e) {
      throw new TasteException("Error persisting factorization", e);
    }
  }

  @Override
  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
    throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
    log.debug("Recommending items for user ID '{}'", userID);

    // Candidate set comes from the configured CandidateItemsStrategy (via AbstractRecommender).
    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);

    // Rank candidates by estimated preference (dot product of feature vectors).
    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
        new Estimator(userID));
    log.debug("Recommendations are: {}", topItems);

    return topItems;
  }

  /**
   * a preference is estimated by computing the dot-product of the user and item feature vectors
   */
  @Override
  public float estimatePreference(long userID, long itemID) throws TasteException {
    // NOTE(review): getUserFeatures/getItemFeatures presumably throw for unknown IDs — confirm in Factorization.
    double[] userFeatures = factorization.getUserFeatures(userID);
    double[] itemFeatures = factorization.getItemFeatures(itemID);
    double estimate = 0;
    for (int feature = 0; feature < userFeatures.length; feature++) {
      estimate += userFeatures[feature] * itemFeatures[feature];
    }
    return (float) estimate;
  }

  /** Adapts estimatePreference() for a fixed user to the TopItems.Estimator interface. */
  private final class Estimator implements TopItems.Estimator<Long> {

    private final long theUserID;

    private Estimator(long theUserID) {
      this.theUserID = theUserID;
    }

    @Override
    public double estimate(Long itemID) throws TasteException {
      return estimatePreference(theUserID, itemID);
    }
  }

  /**
   * Refresh the data model and factorization.
   */
  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    refreshHelper.refresh(alreadyRefreshed);
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
new file mode 100644
index 0000000..e0d6f59
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractItemSimilarity.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+import java.util.Collection;
+
+public abstract class AbstractItemSimilarity implements ItemSimilarity {
+
+ private final DataModel dataModel;
+ private final RefreshHelper refreshHelper;
+
+ protected AbstractItemSimilarity(DataModel dataModel) {
+ Preconditions.checkArgument(dataModel != null, "dataModel is null");
+ this.dataModel = dataModel;
+ this.refreshHelper = new RefreshHelper(null);
+ refreshHelper.addDependency(this.dataModel);
+ }
+
+ protected DataModel getDataModel() {
+ return dataModel;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) throws TasteException {
+ FastIDSet allSimilarItemIDs = new FastIDSet();
+ LongPrimitiveIterator allItemIDs = dataModel.getItemIDs();
+ while (allItemIDs.hasNext()) {
+ long possiblySimilarItemID = allItemIDs.nextLong();
+ if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
+ allSimilarItemIDs.add(possiblySimilarItemID);
+ }
+ }
+ return allSimilarItemIDs.toArray();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
new file mode 100644
index 0000000..59c30d9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
/** Abstract superclass encapsulating functionality that is common to most implementations in this package. */
abstract class AbstractSimilarity extends AbstractItemSimilarity implements UserSimilarity {

  // Optional estimator for preferences a user has not expressed; null means "no inference".
  private PreferenceInferrer inferrer;
  // When true, results are scaled by how many items/users the pair has in common.
  private final boolean weighted;
  // When true, X and Y values are mean-centered before being passed to computeResult().
  private final boolean centerData;
  // Counts cached from the DataModel (refreshed via refreshHelper); used by normalizeWeightResult().
  private int cachedNumItems;
  private int cachedNumUsers;
  private final RefreshHelper refreshHelper;

  /**
   * <p>
   * Creates a possibly weighted {@link AbstractSimilarity}.
   * </p>
   */
  AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException {
    super(dataModel);
    this.weighted = weighting == Weighting.WEIGHTED;
    this.centerData = centerData;
    this.cachedNumItems = dataModel.getNumItems();
    this.cachedNumUsers = dataModel.getNumUsers();
    // Re-read the model's item/user counts whenever the dependency chain is refreshed.
    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
      @Override
      public Object call() throws TasteException {
        cachedNumItems = dataModel.getNumItems();
        cachedNumUsers = dataModel.getNumUsers();
        return null;
      }
    });
  }

  final PreferenceInferrer getPreferenceInferrer() {
    return inferrer;
  }

  @Override
  public final void setPreferenceInferrer(PreferenceInferrer inferrer) {
    Preconditions.checkArgument(inferrer != null, "inferrer is null");
    // Swap refresh dependencies: track the new inferrer, stop tracking the old one (if any).
    refreshHelper.addDependency(inferrer);
    refreshHelper.removeDependency(this.inferrer);
    this.inferrer = inferrer;
  }

  final boolean isWeighted() {
    return weighted;
  }

  /**
   * <p>
   * Several subclasses in this package implement this method to actually compute the similarity from figures
   * computed over users or items. Note that the computations in this class "center" the data, such that X and
   * Y's mean are 0.
   * </p>
   *
   * <p>
   * Note that the sum of all X and Y values must then be 0. This value isn't passed down into the standard
   * similarity computations as a result.
   * </p>
   *
   * @param n
   *          total number of users or items
   * @param sumXY
   *          sum of product of user/item preference values, over all items/users preferred by both
   *          users/items
   * @param sumX2
   *          sum of the square of user/item preference values, over the first item/user
   * @param sumY2
   *          sum of the square of the user/item preference values, over the second item/user
   * @param sumXYdiff2
   *          sum of squares of differences in X and Y values
   * @return similarity value between -1.0 and 1.0, inclusive, or {@link Double#NaN} if no similarity can be
   *         computed (e.g. when no items have been rated by both users)
   */
  abstract double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2);

  @Override
  public double userSimilarity(long userID1, long userID2) throws TasteException {
    DataModel dataModel = getDataModel();
    PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
    PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
    int xLength = xPrefs.length();
    int yLength = yPrefs.length();

    if (xLength == 0 || yLength == 0) {
      return Double.NaN;
    }

    // Merge-style walk over the two preference arrays. NOTE(review): this relies on both arrays
    // being ordered by item ID — confirm against the DataModel contract.
    long xIndex = xPrefs.getItemID(0);
    long yIndex = yPrefs.getItemID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;

    double sumX = 0.0;
    double sumX2 = 0.0;
    double sumY = 0.0;
    double sumY2 = 0.0;
    double sumXY = 0.0;
    double sumXYdiff2 = 0.0;
    int count = 0;

    boolean hasInferrer = inferrer != null;

    while (true) {
      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
      // Without an inferrer, only co-rated items contribute; with one, every item rated by
      // either user contributes (the missing side is inferred).
      if (hasInferrer || compare == 0) {
        double x;
        double y;
        if (xIndex == yIndex) {
          // Both users expressed a preference for the item
          x = xPrefs.getValue(xPrefIndex);
          y = yPrefs.getValue(yPrefIndex);
        } else {
          // Only one user expressed a preference, but infer the other one's preference and tally
          // as if the other user expressed that preference
          if (compare < 0) {
            // X has a value; infer Y's
            x = xPrefs.getValue(xPrefIndex);
            y = inferrer.inferPreference(userID2, xIndex);
          } else {
            // compare > 0
            // Y has a value; infer X's
            x = inferrer.inferPreference(userID1, yIndex);
            y = yPrefs.getValue(yPrefIndex);
          }
        }
        sumXY += x * y;
        sumX += x;
        sumX2 += x * x;
        sumY += y;
        sumY2 += y * y;
        double diff = x - y;
        sumXYdiff2 += diff * diff;
        count++;
      }
      if (compare <= 0) {
        if (++xPrefIndex >= xLength) {
          if (hasInferrer) {
            // Must count other Ys; pretend next X is far away
            if (yIndex == Long.MAX_VALUE) {
              // ... but stop if both are done!
              break;
            }
            xIndex = Long.MAX_VALUE;
          } else {
            break;
          }
        } else {
          xIndex = xPrefs.getItemID(xPrefIndex);
        }
      }
      if (compare >= 0) {
        if (++yPrefIndex >= yLength) {
          if (hasInferrer) {
            // Must count other Xs; pretend next Y is far away
            if (xIndex == Long.MAX_VALUE) {
              // ... but stop if both are done!
              break;
            }
            yIndex = Long.MAX_VALUE;
          } else {
            break;
          }
        } else {
          yIndex = yPrefs.getItemID(yPrefIndex);
        }
      }
    }

    // "Center" the data. If my math is correct, this'll do it.
    // (The commented-out lines show the full expansion; the simplified forms are equivalent
    // because sums over the same n cancel the cross terms.)
    double result;
    if (centerData) {
      double meanX = sumX / count;
      double meanY = sumY / count;
      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
      double centeredSumXY = sumXY - meanY * sumX;
      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
      double centeredSumX2 = sumX2 - meanX * sumX;
      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
      double centeredSumY2 = sumY2 - meanY * sumY;
      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
    } else {
      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
    }

    if (!Double.isNaN(result)) {
      // For user similarity the overlap is counted in items, hence cachedNumItems.
      result = normalizeWeightResult(result, count, cachedNumItems);
    }
    return result;
  }

  @Override
  public final double itemSimilarity(long itemID1, long itemID2) throws TasteException {
    DataModel dataModel = getDataModel();
    PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1);
    PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2);
    int xLength = xPrefs.length();
    int yLength = yPrefs.length();

    if (xLength == 0 || yLength == 0) {
      return Double.NaN;
    }

    // Merge-style walk over the two arrays, this time keyed by user ID.
    // NOTE(review): relies on the arrays being ordered by user ID — confirm against the DataModel contract.
    long xIndex = xPrefs.getUserID(0);
    long yIndex = yPrefs.getUserID(0);
    int xPrefIndex = 0;
    int yPrefIndex = 0;

    double sumX = 0.0;
    double sumX2 = 0.0;
    double sumY = 0.0;
    double sumY2 = 0.0;
    double sumXY = 0.0;
    double sumXYdiff2 = 0.0;
    int count = 0;

    // No, pref inferrers and transforms don't apply here. I think.

    while (true) {
      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
      if (compare == 0) {
        // Both users expressed a preference for the item
        double x = xPrefs.getValue(xPrefIndex);
        double y = yPrefs.getValue(yPrefIndex);
        sumXY += x * y;
        sumX += x;
        sumX2 += x * x;
        sumY += y;
        sumY2 += y * y;
        double diff = x - y;
        sumXYdiff2 += diff * diff;
        count++;
      }
      if (compare <= 0) {
        if (++xPrefIndex == xLength) {
          break;
        }
        xIndex = xPrefs.getUserID(xPrefIndex);
      }
      if (compare >= 0) {
        if (++yPrefIndex == yLength) {
          break;
        }
        yIndex = yPrefs.getUserID(yPrefIndex);
      }
    }

    double result;
    if (centerData) {
      // See comments above on these computations
      double n = (double) count;
      double meanX = sumX / n;
      double meanY = sumY / n;
      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
      double centeredSumXY = sumXY - meanY * sumX;
      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
      double centeredSumX2 = sumX2 - meanX * sumX;
      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
      double centeredSumY2 = sumY2 - meanY * sumY;
      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
    } else {
      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
    }

    if (!Double.isNaN(result)) {
      // For item similarity the overlap is counted in users, hence cachedNumUsers.
      result = normalizeWeightResult(result, count, cachedNumUsers);
    }
    return result;
  }

  @Override
  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
    int length = itemID2s.length;
    double[] result = new double[length];
    for (int i = 0; i < length; i++) {
      result[i] = itemSimilarity(itemID1, itemID2s[i]);
    }
    return result;
  }

  /**
   * Optionally pulls the result toward +/-1 in proportion to overlap (count/num), then clamps
   * the result to [-1.0, 1.0] to guard against rounding drift.
   */
  final double normalizeWeightResult(double result, int count, int num) {
    double normalizedResult = result;
    if (weighted) {
      double scaleFactor = 1.0 - (double) count / (double) (num + 1);
      if (normalizedResult < 0.0) {
        normalizedResult = -1.0 + scaleFactor * (1.0 + normalizedResult);
      } else {
        normalizedResult = 1.0 - scaleFactor * (1.0 - normalizedResult);
      }
    }
    // Make sure the result is not accidentally a little outside [-1.0, 1.0] due to rounding:
    if (normalizedResult < -1.0) {
      normalizedResult = -1.0;
    } else if (normalizedResult > 1.0) {
      normalizedResult = 1.0;
    }
    return normalizedResult;
  }

  @Override
  public final void refresh(Collection<Refreshable> alreadyRefreshed) {
    super.refresh(alreadyRefreshed);
    refreshHelper.refresh(alreadyRefreshed);
  }

  @Override
  public final String toString() {
    return this.getClass().getSimpleName() + "[dataModel:" + getDataModel() + ",inferrer:" + inferrer + ']';
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
new file mode 100644
index 0000000..7c655fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+
+/**
+ * <p>
+ * Implementations of this interface compute an inferred preference for a user and an item that the user has
+ * not expressed any preference for. This might be an average of other preferences scores from that user, for
+ * example. This technique is sometimes called "default voting".
+ * </p>
+ */
+public final class AveragingPreferenceInferrer implements PreferenceInferrer {
+
+ private static final Float ZERO = 0.0f;
+
+ private final DataModel dataModel;
+ private final Cache<Long,Float> averagePreferenceValue;
+
+ public AveragingPreferenceInferrer(DataModel dataModel) throws TasteException {
+ this.dataModel = dataModel;
+ Retriever<Long,Float> retriever = new PrefRetriever();
+ averagePreferenceValue = new Cache<>(retriever, dataModel.getNumUsers());
+ refresh(null);
+ }
+
+ @Override
+ public float inferPreference(long userID, long itemID) throws TasteException {
+ return averagePreferenceValue.get(userID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ averagePreferenceValue.clear();
+ }
+
+ private final class PrefRetriever implements Retriever<Long,Float> {
+
+ @Override
+ public Float get(Long key) throws TasteException {
+ PreferenceArray prefs = dataModel.getPreferencesFromUser(key);
+ int size = prefs.length();
+ if (size == 0) {
+ return ZERO;
+ }
+ RunningAverage average = new FullRunningAverage();
+ for (int i = 0; i < size; i++) {
+ average.addDatum(prefs.getValue(i));
+ }
+ return (float) average.getAverage();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "AveragingPreferenceInferrer";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
new file mode 100644
index 0000000..87aeae9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.LongPair;
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link ItemSimilarity} implementation.
+ */
+public final class CachingItemSimilarity implements ItemSimilarity {
+
+ private final ItemSimilarity similarity;
+ private final Cache<LongPair,Double> similarityCache;
+ private final RefreshHelper refreshHelper;
+
+ /**
+ * Creates this on top of the given {@link ItemSimilarity}.
+ * The cache is sized according to properties of the given {@link DataModel}.
+ */
+ public CachingItemSimilarity(ItemSimilarity similarity, DataModel dataModel) throws TasteException {
+ this(similarity, dataModel.getNumItems());
+ }
+
+ /**
+ * Creates this on top of the given {@link ItemSimilarity}.
+ * The cache size is capped by the given size.
+ */
+ public CachingItemSimilarity(ItemSimilarity similarity, int maxCacheSize) {
+ Preconditions.checkArgument(similarity != null, "similarity is null");
+ this.similarity = similarity;
+ this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+ this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ similarityCache.clear();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(similarity);
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ LongPair key = itemID1 < itemID2 ? new LongPair(itemID1, itemID2) : new LongPair(itemID2, itemID1);
+ return similarityCache.get(key);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = itemSimilarity(itemID1, itemID2s[i]);
+ }
+ return result;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) throws TasteException {
+ return similarity.allSimilarItemIDs(itemID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ public void clearCacheForItem(long itemID) {
+ similarityCache.removeKeysMatching(new LongPairMatchPredicate(itemID));
+ }
+
+ private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+ private final ItemSimilarity similarity;
+
+ private SimilarityRetriever(ItemSimilarity similarity) {
+ this.similarity = similarity;
+ }
+
+ @Override
+ public Double get(LongPair key) throws TasteException {
+ return similarity.itemSimilarity(key.getFirst(), key.getSecond());
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
new file mode 100644
index 0000000..873568a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.LongPair;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link UserSimilarity} implementation.
+ */
+public final class CachingUserSimilarity implements UserSimilarity {
+
+ private final UserSimilarity similarity;
+ private final Cache<LongPair,Double> similarityCache;
+ private final RefreshHelper refreshHelper;
+
+ /**
+ * Creates this on top of the given {@link UserSimilarity}.
+ * The cache is sized according to properties of the given {@link DataModel}.
+ */
+ public CachingUserSimilarity(UserSimilarity similarity, DataModel dataModel) throws TasteException {
+ this(similarity, dataModel.getNumUsers());
+ }
+
+ /**
+ * Creates this on top of the given {@link UserSimilarity}.
+ * The cache size is capped by the given size.
+ */
+ public CachingUserSimilarity(UserSimilarity similarity, int maxCacheSize) {
+ Preconditions.checkArgument(similarity != null, "similarity is null");
+ this.similarity = similarity;
+ this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+ this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ similarityCache.clear();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(similarity);
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ LongPair key = userID1 < userID2 ? new LongPair(userID1, userID2) : new LongPair(userID2, userID1);
+ return similarityCache.get(key);
+ }
+
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ similarityCache.clear();
+ similarity.setPreferenceInferrer(inferrer);
+ }
+
+ public void clearCacheForUser(long userID) {
+ similarityCache.removeKeysMatching(new LongPairMatchPredicate(userID));
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+ private final UserSimilarity similarity;
+
+ private SimilarityRetriever(UserSimilarity similarity) {
+ this.similarity = similarity;
+ }
+
+ @Override
+ public Double get(LongPair key) throws TasteException {
+ return similarity.userSimilarity(key.getFirst(), key.getSecond());
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
new file mode 100644
index 0000000..88fbe58
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * Implementation of City Block distance (also known as Manhattan distance) - the absolute value of the difference of
+ * each direction is summed. The resulting unbounded distance is then mapped between 0 and 1.
+ */
+public final class CityBlockSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public CityBlockSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(refreshed, getDataModel());
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ return doSimilarity(preferring1, preferring2, intersection);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ double[] distance = new double[itemID2s.length];
+ for (int i = 0; i < itemID2s.length; ++i) {
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2s[i]);
+ int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2s[i]);
+ distance[i] = doSimilarity(preferring1, preferring2, intersection);
+ }
+ return distance;
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+ int prefs1Size = prefs1.size();
+ int prefs2Size = prefs2.size();
+ int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+ return doSimilarity(prefs1Size, prefs2Size, intersectionSize);
+ }
+
+ /**
+ * Calculate City Block Distance from total non-zero values and intersections and map to a similarity value.
+ *
+ * @param pref1 number of non-zero values in left vector
+ * @param pref2 number of non-zero values in right vector
+ * @param intersection number of overlapping non-zero values
+ */
+ private static double doSimilarity(int pref1, int pref2, int intersection) {
+ int distance = pref1 + pref2 - 2 * intersection;
+ return 1.0 / (1.0 + distance);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
new file mode 100644
index 0000000..990e9ea
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
 * of items as dimensions and preferences as points along those dimensions, a distance is computed using all
 * items (dimensions) where both users have expressed a preference for that item. This is simply the square
 * root of the sum of the squares of differences in position (preference) along each dimension.</p>
 *
 * <p>The similarity is computed as 1 / (1 + distance / sqrt(n)), which is algebraically equivalent to
 * sqrt(n) / (sqrt(n) + distance), where n is the number of dimensions (co-rated items). Dividing the
 * distance by sqrt(n) helps correct for the fact that overlapping in more dimensions offers more
 * opportunities to be farther apart; randomly-chosen points have a distance that grows as sqrt(n).
 * Since the distance is non-negative, the resulting values lie in the range (0,1].</p>
 *
 * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
 * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
 * it doesn't change ordering.</p>
 */
public final class EuclideanDistanceSimilarity extends AbstractSimilarity {

  /**
   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
   */
  public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException {
    this(dataModel, Weighting.UNWEIGHTED);
  }

  /**
   * @throws IllegalArgumentException if {@link DataModel} does not have preference values
   */
  public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
    super(dataModel, weighting, false);
    Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
  }

  // n = number of co-rated dimensions; sumXYdiff2 = sum of squared preference differences,
  // so sqrt(sumXYdiff2) is the Euclidean distance. Returns 1 / (1 + distance / sqrt(n)).
  @Override
  double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
    return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
new file mode 100644
index 0000000..d0c9b8c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
@@ -0,0 +1,358 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its
 * responses on that alone. The values may have been precomputed offline by another process, stored in a file,
 * and then read and fed into an instance of this class.
 * </p>
 *
 * <p>
 * This is perhaps the best {@link ItemSimilarity} to use with
 * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point
 * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively
 * static, can be precomputed, and then used in computation to gain a significant performance advantage.
 * </p>
 */
public final class GenericItemSimilarity implements ItemSimilarity {

  // Shared empty result returned by allSimilarItemIDs() for items with no entries.
  private static final long[] NO_IDS = new long[0];

  // Maps the smaller item ID of each pair to (larger item ID -> similarity value).
  private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();
  // Maps each item ID to the set of all item IDs it has any similarity entry with (both directions).
  private final FastByIDMap<FastIDSet> similarItemIDsIndex = new FastByIDMap<>();

  /**
   * <p>
   * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each
   * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is
   * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It
   * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0.
   * </p>
   *
   * <p>
   * Note that specifying a similarity between two items twice is not an error, but, the later value will win.
   * </p>
   *
   * @param similarities
   *          set of {@link ItemItemSimilarity}s on which to base this instance
   */
  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities) {
    initSimilarityMaps(similarities.iterator());
  }

  /**
   * <p>
   * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities
   * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those
   * that are therefore most important.
   * </p>
   *
   * <p>
   * Thanks to tsmorton for suggesting this and providing part of the implementation.
   * </p>
   *
   * @param similarities
   *          set of {@link ItemItemSimilarity}s on which to base this instance
   * @param maxToKeep
   *          maximum number of similarities to keep
   */
  public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities, int maxToKeep) {
    Iterable<ItemItemSimilarity> keptSimilarities =
        TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator());
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /**
   * <p>
   * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a
   * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s.
   * </p>
   *
   * <p>
   * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an
   * item-based recommender. Item-based recommenders use the assumption that item-item similarities are
   * relatively fixed, and might be known already independent of user preferences. Hence it is useful to
   * inject that information, using {@link #GenericItemSimilarity(Iterable)}.
   * </p>
   *
   * @param otherSimilarity
   *          other {@link ItemSimilarity} to get similarities from
   * @param dataModel
   *          data model to get items from
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel} items
   */
  public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
    initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs));
  }

  /**
   * <p>
   * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)}, but will only keep the specified
   * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity
   * -- those that are therefore most important.
   * </p>
   *
   * <p>
   * Thanks to tsmorton for suggesting this and providing part of the implementation.
   * </p>
   *
   * @param otherSimilarity
   *          other {@link ItemSimilarity} to get similarities from
   * @param dataModel
   *          data model to get items from
   * @param maxToKeep
   *          maximum number of similarities to keep
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel} items
   */
  public GenericItemSimilarity(ItemSimilarity otherSimilarity,
                               DataModel dataModel,
                               int maxToKeep) throws TasteException {
    long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
    Iterator<ItemItemSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs);
    Iterable<ItemItemSimilarity> keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it);
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /**
   * Populates {@link #similarityMaps} and {@link #similarItemIDsIndex} from the given similarities.
   * Pairs are stored once, keyed by the smaller item ID; self-pairs are skipped (assumed 1.0).
   */
  private void initSimilarityMaps(Iterator<ItemItemSimilarity> similarities) {
    while (similarities.hasNext()) {
      ItemItemSimilarity iic = similarities.next();
      long similarityItemID1 = iic.getItemID1();
      long similarityItemID2 = iic.getItemID2();
      if (similarityItemID1 != similarityItemID2) {
        // Order them -- first key should be the "smaller" one
        long itemID1;
        long itemID2;
        if (similarityItemID1 < similarityItemID2) {
          itemID1 = similarityItemID1;
          itemID2 = similarityItemID2;
        } else {
          itemID1 = similarityItemID2;
          itemID2 = similarityItemID1;
        }
        FastByIDMap<Double> map = similarityMaps.get(itemID1);
        if (map == null) {
          map = new FastByIDMap<>();
          similarityMaps.put(itemID1, map);
        }
        // Duplicate pairs are not an error; the later value wins.
        map.put(itemID2, iic.getValue());

        // Index both directions so allSimilarItemIDs() works from either item.
        doIndex(itemID1, itemID2);
        doIndex(itemID2, itemID1);
      }
      // else similarity between item and itself already assumed to be 1.0
    }
  }

  // Records toItemID in fromItemID's similar-item set, creating the set on first use.
  private void doIndex(long fromItemID, long toItemID) {
    FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID);
    if (similarItemIDs == null) {
      similarItemIDs = new FastIDSet();
      similarItemIDsIndex.put(fromItemID, similarItemIDs);
    }
    similarItemIDs.add(toItemID);
  }

  /**
   * <p>
   * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that
   * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that
   * {@code itemSimilarity(item1,item1) == 1.0} for all items. Returns {@code Double.NaN} for
   * pairs with no recorded similarity.
   * </p>
   *
   * @param itemID1
   *          first item
   * @param itemID2
   *          second item
   * @return similarity between the two
   */
  @Override
  public double itemSimilarity(long itemID1, long itemID2) {
    if (itemID1 == itemID2) {
      return 1.0;
    }
    // Look up under the canonical (smaller, larger) ordering used at build time.
    long firstID;
    long secondID;
    if (itemID1 < itemID2) {
      firstID = itemID1;
      secondID = itemID2;
    } else {
      firstID = itemID2;
      secondID = itemID1;
    }
    FastByIDMap<Double> nextMap = similarityMaps.get(firstID);
    if (nextMap == null) {
      return Double.NaN;
    }
    Double similarity = nextMap.get(secondID);
    return similarity == null ? Double.NaN : similarity;
  }

  /** Bulk form of {@link #itemSimilarity(long, long)}, one result per element of {@code itemID2s}. */
  @Override
  public double[] itemSimilarities(long itemID1, long[] itemID2s) {
    int length = itemID2s.length;
    double[] result = new double[length];
    for (int i = 0; i < length; i++) {
      result[i] = itemSimilarity(itemID1, itemID2s[i]);
    }
    return result;
  }

  /** Returns all item IDs with a recorded similarity to {@code itemID}, or an empty array if none. */
  @Override
  public long[] allSimilarItemIDs(long itemID) {
    FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID);
    return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS;
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Do nothing: the similarities are static precomputed data with no underlying source to refresh.
  }

  /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */
  public static final class ItemItemSimilarity implements Comparable<ItemItemSimilarity> {

    private final long itemID1;
    private final long itemID2;
    private final double value;

    /**
     * @param itemID1
     *          first item
     * @param itemID2
     *          second item
     * @param value
     *          similarity between the two
     * @throws IllegalArgumentException
     *           if value is NaN, less than -1.0 or greater than 1.0
     */
    public ItemItemSimilarity(long itemID1, long itemID2, double value) {
      Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
      this.itemID1 = itemID1;
      this.itemID2 = itemID2;
      this.value = value;
    }

    public long getItemID1() {
      return itemID1;
    }

    public long getItemID2() {
      return itemID2;
    }

    public double getValue() {
      return value;
    }

    @Override
    public String toString() {
      return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']';
    }

    /** Defines an ordering from highest similarity to lowest. */
    @Override
    public int compareTo(ItemItemSimilarity other) {
      double otherValue = other.getValue();
      return value > otherValue ? -1 : value < otherValue ? 1 : 0;
    }

    // NOTE(review): equality is on the ordered (itemID1, itemID2) pair plus value; it does not
    // treat (a,b) and (b,a) as equal. Also, compareTo orders by value only, so it is not
    // consistent with equals — acceptable for ranking, but do not use in a SortedSet/SortedMap.
    @Override
    public boolean equals(Object other) {
      if (!(other instanceof ItemItemSimilarity)) {
        return false;
      }
      ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other;
      return otherSimilarity.getItemID1() == itemID1
          && otherSimilarity.getItemID2() == itemID2
          && otherSimilarity.getValue() == value;
    }

    @Override
    public int hashCode() {
      return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(value);
    }

  }

  /**
   * Lazily enumerates an {@link ItemItemSimilarity} for every unordered pair (i, j), i &lt; j, of the
   * given item IDs, skipping pairs whose computed similarity is {@code NaN}.
   */
  private static final class DataModelSimilaritiesIterator extends AbstractIterator<ItemItemSimilarity> {

    private final ItemSimilarity otherSimilarity;
    private final long[] itemIDs;
    private int i;        // index of the first item of the current pair
    private long itemID1; // cached itemIDs[i]
    private int j;        // index of the second item of the current pair; always > i

    private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) {
      this.otherSimilarity = otherSimilarity;
      this.itemIDs = itemIDs;
      i = 0;
      // NOTE(review): assumes itemIDs is non-empty; itemIDs[0] throws
      // ArrayIndexOutOfBoundsException on an empty array — confirm callers guarantee >= 1 item.
      itemID1 = itemIDs[0];
      j = 1;
    }

    @Override
    protected ItemItemSimilarity computeNext() {
      int size = itemIDs.length;
      ItemItemSimilarity result = null;
      while (result == null && i < size - 1) {
        long itemID2 = itemIDs[j];
        double similarity;
        try {
          similarity = otherSimilarity.itemSimilarity(itemID1, itemID2);
        } catch (TasteException te) {
          // ugly: AbstractIterator.computeNext() cannot throw a checked exception
          throw new IllegalStateException(te);
        }
        if (!Double.isNaN(similarity)) {
          result = new ItemItemSimilarity(itemID1, itemID2, similarity);
        }
        // Advance to the next pair; when j wraps, move i forward and restart j just past it.
        if (++j == size) {
          itemID1 = itemIDs[++i];
          j = i + 1;
        }
      }
      if (result == null) {
        return endOfData();
      } else {
        return result;
      }
    }

  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
new file mode 100644
index 0000000..1c221c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
/**
 * A "generic" {@link UserSimilarity} built from a static list of precomputed user-user similarities,
 * the user-based twin of {@link GenericItemSimilarity}: responses are based on the precomputed values
 * alone, and unknown pairs yield {@code Double.NaN}.
 */
public final class GenericUserSimilarity implements UserSimilarity {

  // Maps the smaller user ID of each pair to (larger user ID -> similarity value).
  private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();

  /** Creates this from a precomputed collection of {@link UserUserSimilarity}s. */
  public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities) {
    initSimilarityMaps(similarities.iterator());
  }

  /** Like {@link #GenericUserSimilarity(Iterable)}, but keeps only the {@code maxToKeep} highest similarities. */
  public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities, int maxToKeep) {
    Iterable<UserUserSimilarity> keptSimilarities =
        TopItems.getTopUserUserSimilarities(maxToKeep, similarities.iterator());
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /** Precomputes all user-user similarities from {@code otherSimilarity} over the model's users. */
  public GenericUserSimilarity(UserSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
    long[] userIDs = longIteratorToList(dataModel.getUserIDs());
    initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, userIDs));
  }

  /** Like {@link #GenericUserSimilarity(UserSimilarity, DataModel)}, keeping only the {@code maxToKeep} highest. */
  public GenericUserSimilarity(UserSimilarity otherSimilarity,
                               DataModel dataModel,
                               int maxToKeep) throws TasteException {
    long[] userIDs = longIteratorToList(dataModel.getUserIDs());
    Iterator<UserUserSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, userIDs);
    Iterable<UserUserSimilarity> keptSimilarities = TopItems.getTopUserUserSimilarities(maxToKeep, it);
    initSimilarityMaps(keptSimilarities.iterator());
  }

  /**
   * Drains the iterator into a primitive long array, growing by doubling and trimming
   * to exact size at the end. Also used by {@link GenericItemSimilarity}.
   */
  static long[] longIteratorToList(LongPrimitiveIterator iterator) {
    long[] result = new long[5];
    int size = 0;
    while (iterator.hasNext()) {
      if (size == result.length) {
        long[] newResult = new long[result.length << 1];
        System.arraycopy(result, 0, newResult, 0, result.length);
        result = newResult;
      }
      result[size++] = iterator.next();
    }
    if (size != result.length) {
      long[] newResult = new long[size];
      System.arraycopy(result, 0, newResult, 0, size);
      result = newResult;
    }
    return result;
  }

  /**
   * Populates {@link #similarityMaps} from the given similarities. Pairs are stored once,
   * keyed by the smaller user ID; self-pairs are skipped (assumed 1.0); duplicates are not
   * an error — the later value wins.
   */
  private void initSimilarityMaps(Iterator<UserUserSimilarity> similarities) {
    while (similarities.hasNext()) {
      UserUserSimilarity uuc = similarities.next();
      long similarityUser1 = uuc.getUserID1();
      long similarityUser2 = uuc.getUserID2();
      if (similarityUser1 != similarityUser2) {
        // Order them -- first key should be the "smaller" one
        long user1;
        long user2;
        if (similarityUser1 < similarityUser2) {
          user1 = similarityUser1;
          user2 = similarityUser2;
        } else {
          user1 = similarityUser2;
          user2 = similarityUser1;
        }
        FastByIDMap<Double> map = similarityMaps.get(user1);
        if (map == null) {
          map = new FastByIDMap<>();
          similarityMaps.put(user1, map);
        }
        map.put(user2, uuc.getValue());
      }
      // else similarity between user and itself already assumed to be 1.0
    }
  }

  /**
   * Returns the precomputed similarity between the two users, or {@code Double.NaN} if the
   * pair is unknown. Self-similarity is always 1.0.
   */
  @Override
  public double userSimilarity(long userID1, long userID2) {
    if (userID1 == userID2) {
      return 1.0;
    }
    // Look up under the canonical (smaller, larger) ordering used at build time.
    long first;
    long second;
    if (userID1 < userID2) {
      first = userID1;
      second = userID2;
    } else {
      first = userID2;
      second = userID1;
    }
    FastByIDMap<Double> nextMap = similarityMaps.get(first);
    if (nextMap == null) {
      return Double.NaN;
    }
    Double similarity = nextMap.get(second);
    return similarity == null ? Double.NaN : similarity;
  }

  /**
   * @throws UnsupportedOperationException always; inference cannot apply to static precomputed values
   */
  @Override
  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Do nothing: the similarities are static precomputed data with no underlying source to refresh.
  }

  /** Encapsulates a similarity between two users. Similarity must be in the range [-1.0,1.0]. */
  public static final class UserUserSimilarity implements Comparable<UserUserSimilarity> {

    private final long userID1;
    private final long userID2;
    private final double value;

    /**
     * @throws IllegalArgumentException
     *           if value is NaN, less than -1.0 or greater than 1.0
     */
    public UserUserSimilarity(long userID1, long userID2, double value) {
      Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
      this.userID1 = userID1;
      this.userID2 = userID2;
      this.value = value;
    }

    public long getUserID1() {
      return userID1;
    }

    public long getUserID2() {
      return userID2;
    }

    public double getValue() {
      return value;
    }

    @Override
    public String toString() {
      return "UserUserSimilarity[" + userID1 + ',' + userID2 + ':' + value + ']';
    }

    /** Defines an ordering from highest similarity to lowest. */
    @Override
    public int compareTo(UserUserSimilarity other) {
      double otherValue = other.getValue();
      return value > otherValue ? -1 : value < otherValue ? 1 : 0;
    }

    // NOTE(review): equality is on the ordered (userID1, userID2) pair plus value, and compareTo
    // orders by value only, so compareTo is not consistent with equals — avoid SortedSet/SortedMap.
    @Override
    public boolean equals(Object other) {
      if (!(other instanceof UserUserSimilarity)) {
        return false;
      }
      UserUserSimilarity otherSimilarity = (UserUserSimilarity) other;
      return otherSimilarity.getUserID1() == userID1
          && otherSimilarity.getUserID2() == userID2
          && otherSimilarity.getValue() == value;
    }

    @Override
    public int hashCode() {
      return (int) userID1 ^ (int) userID2 ^ RandomUtils.hashDouble(value);
    }

  }

  /**
   * Lazily enumerates a {@link UserUserSimilarity} for every unordered pair (i, j), i &lt; j, of the
   * given IDs, skipping pairs whose computed similarity is {@code NaN}. (Field names follow the
   * item-based twin in {@link GenericItemSimilarity}; here the "itemIDs" actually hold user IDs.)
   */
  private static final class DataModelSimilaritiesIterator extends AbstractIterator<UserUserSimilarity> {

    private final UserSimilarity otherSimilarity;
    private final long[] itemIDs;
    private int i;        // index of the first ID of the current pair
    private long itemID1; // cached itemIDs[i]
    private int j;        // index of the second ID of the current pair; always > i

    private DataModelSimilaritiesIterator(UserSimilarity otherSimilarity, long[] itemIDs) {
      this.otherSimilarity = otherSimilarity;
      this.itemIDs = itemIDs;
      i = 0;
      // NOTE(review): assumes itemIDs is non-empty; itemIDs[0] throws
      // ArrayIndexOutOfBoundsException on an empty array — confirm callers guarantee >= 1 user.
      itemID1 = itemIDs[0];
      j = 1;
    }

    @Override
    protected UserUserSimilarity computeNext() {
      int size = itemIDs.length;
      while (i < size - 1) {
        long itemID2 = itemIDs[j];
        double similarity;
        try {
          similarity = otherSimilarity.userSimilarity(itemID1, itemID2);
        } catch (TasteException te) {
          // ugly: AbstractIterator.computeNext() cannot throw a checked exception
          throw new IllegalStateException(te);
        }
        if (!Double.isNaN(similarity)) {
          return new UserUserSimilarity(itemID1, itemID2, similarity);
        }
        // Advance to the next pair; when j wraps, move i forward and restart j just past it.
        if (++j == size) {
          itemID1 = itemIDs[++i];
          j = i + 1;
        }
      }
      return endOfData();
    }

  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
new file mode 100644
index 0000000..3084c8f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.math.stats.LogLikelihood;
+
+/**
+ * See <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962">
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962</a> and
+ * <a href="http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html">
+ * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html</a>.
+ */
+public final class LogLikelihoodSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public LogLikelihoodSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+
+ long prefs1Size = prefs1.size();
+ long prefs2Size = prefs2.size();
+ long intersectionSize =
+ prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+ if (intersectionSize == 0) {
+ return Double.NaN;
+ }
+ long numItems = dataModel.getNumItems();
+ double logLikelihood =
+ LogLikelihood.logLikelihoodRatio(intersectionSize,
+ prefs2Size - intersectionSize,
+ prefs1Size - intersectionSize,
+ numItems - prefs1Size - prefs2Size + intersectionSize);
+ return 1.0 - 1.0 / (1.0 + logLikelihood);
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
+ return doItemSimilarity(itemID1, itemID2, preferring1, numUsers);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1, numUsers);
+ }
+ return result;
+ }
+
+ private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ if (preferring1and2 == 0) {
+ return Double.NaN;
+ }
+ long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ double logLikelihood =
+ LogLikelihood.logLikelihoodRatio(preferring1and2,
+ preferring2 - preferring1and2,
+ preferring1 - preferring1and2,
+ numUsers - preferring1 - preferring2 + preferring1and2);
+ return 1.0 - 1.0 / (1.0 + logLikelihood);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+ }
+
+ @Override
+ public String toString() {
+ return "LogLikelihoodSimilarity[dataModel:" + getDataModel() + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
new file mode 100644
index 0000000..48dc4e0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * A {@link Cache.MatchPredicate} that matches a {@link LongPair} when either of its
+ * two elements equals a given ID.
+ */
+final class LongPairMatchPredicate implements Cache.MatchPredicate<LongPair> {
+
+  private final long id;
+
+  LongPairMatchPredicate(long id) {
+    this.id = id;
+  }
+
+  @Override
+  public boolean matches(LongPair pair) {
+    // Match on either position of the pair.
+    return id == pair.getFirst() || id == pair.getSecond();
+  }
+
+}
r***@apache.org
2018-06-28 14:54:48 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
new file mode 100644
index 0000000..3a62b08
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractRecommender.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Base class for {@link Recommender} implementations: holds the {@link DataModel} and a
+ * {@link CandidateItemsStrategy}, and supplies default delegating implementations of the
+ * simpler {@code recommend} overloads and of preference mutation.
+ */
+public abstract class AbstractRecommender implements Recommender {
+
+  private static final Logger log = LoggerFactory.getLogger(AbstractRecommender.class);
+
+  private final DataModel dataModel;
+  private final CandidateItemsStrategy candidateItemsStrategy;
+
+  protected AbstractRecommender(DataModel dataModel, CandidateItemsStrategy candidateItemsStrategy) {
+    this.dataModel = Preconditions.checkNotNull(dataModel);
+    this.candidateItemsStrategy = Preconditions.checkNotNull(candidateItemsStrategy);
+  }
+
+  protected AbstractRecommender(DataModel dataModel) {
+    this(dataModel, getDefaultCandidateItemsStrategy());
+  }
+
+  /** @return the strategy used when none is supplied explicitly */
+  protected static CandidateItemsStrategy getDefaultCandidateItemsStrategy() {
+    return new PreferredItemsNeighborhoodCandidateItemsStrategy();
+  }
+
+  /**
+   * Delegates to {@link Recommender#recommend(long, int, IDRescorer, boolean)} with no
+   * rescorer and known items excluded.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+    return recommend(userID, howMany, null, false);
+  }
+
+  /**
+   * Delegates to {@link Recommender#recommend(long, int, IDRescorer, boolean)} with no
+   * rescorer.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+    return recommend(userID, howMany, null, includeKnownItems);
+  }
+
+  /**
+   * Delegates to {@link Recommender#recommend(long, int, IDRescorer, boolean)} with known
+   * items excluded.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+    return recommend(userID, howMany, rescorer, false);
+  }
+
+  /**
+   * Default implementation which just calls {@link DataModel#setPreference(long, long, float)}.
+   *
+   * @throws IllegalArgumentException if value is {@link Float#NaN}
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+    log.debug("Setting preference for user {}, item {}", userID, itemID);
+    dataModel.setPreference(userID, itemID, value);
+  }
+
+  /**
+   * Default implementation which just calls {@link DataModel#removePreference(long, long)}.
+   */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    log.debug("Remove preference for user '{}', item '{}'", userID, itemID);
+    dataModel.removePreference(userID, itemID);
+  }
+
+  @Override
+  public DataModel getDataModel() {
+    return dataModel;
+  }
+
+  /**
+   * @param userID
+   *          ID of user being evaluated
+   * @param preferencesFromUser
+   *          the preferences from the user
+   * @param includeKnownItems
+   *          whether to include items already known by the user in recommendations
+   * @return candidate items for the user, as chosen by the configured
+   *         {@link CandidateItemsStrategy}
+   * @throws TasteException
+   *           if an error occurs while listing items
+   */
+  protected FastIDSet getAllOtherItems(long userID, PreferenceArray preferencesFromUser, boolean includeKnownItems)
+    throws TasteException {
+    return candidateItemsStrategy.getCandidateItems(userID, preferencesFromUser, dataModel, includeKnownItems);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..37389a7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllSimilarItemsCandidateItemsStrategy.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * Returns the union of {@link ItemSimilarity#allSimilarItemIDs(long)} over the user's
+ * preferred items as the candidate set, optionally minus the preferred items themselves.
+ */
+public class AllSimilarItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  private final ItemSimilarity similarity;
+
+  public AllSimilarItemsCandidateItemsStrategy(ItemSimilarity similarity) {
+    Preconditions.checkArgument(similarity != null, "similarity is null");
+    this.similarity = similarity;
+  }
+
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    FastIDSet candidates = new FastIDSet();
+    // Collect everything similar to anything the user already prefers.
+    for (long preferredItemID : preferredItemIDs) {
+      candidates.addAll(similarity.allSimilarItemIDs(preferredItemID));
+    }
+    if (!includeKnownItems) {
+      candidates.removeAll(preferredItemIDs);
+    }
+    return candidates;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..929eddd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AllUnknownItemsCandidateItemsStrategy.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Candidate strategy that considers every item in the {@link DataModel}, optionally
+ * excluding those the user already has a preference for.
+ */
+public final class AllUnknownItemsCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  /** return all items the user has not yet seen */
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    // Pre-size to the full item count, since every item ID is added below.
+    FastIDSet candidates = new FastIDSet(dataModel.getNumItems());
+    for (LongPrimitiveIterator itemIDs = dataModel.getItemIDs(); itemIDs.hasNext();) {
+      candidates.add(itemIDs.nextLong());
+    }
+    if (!includeKnownItems) {
+      candidates.removeAll(preferredItemIDs);
+    }
+    return candidates;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
new file mode 100644
index 0000000..1677ea8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByRescoreComparator.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * Orders {@link RecommendedItem}s by the rescored value of their estimated preference,
+ * from high to low. When no {@link IDRescorer} is configured, the raw values are used.
+ */
+final class ByRescoreComparator implements Comparator<RecommendedItem>, Serializable {
+
+  private final IDRescorer rescorer;
+
+  ByRescoreComparator(IDRescorer rescorer) {
+    this.rescorer = rescorer;
+  }
+
+  @Override
+  public int compare(RecommendedItem o1, RecommendedItem o2) {
+    double score1;
+    double score2;
+    if (rescorer != null) {
+      score1 = rescorer.rescore(o1.getItemID(), o1.getValue());
+      score2 = rescorer.rescore(o2.getItemID(), o2.getValue());
+    } else {
+      score1 = o1.getValue();
+      score2 = o2.getValue();
+    }
+    // Descending order. Primitive comparisons (rather than Double.compare) are
+    // deliberate: a NaN score compares as equal to everything.
+    if (score2 > score1) {
+      return 1;
+    }
+    if (score2 < score1) {
+      return -1;
+    }
+    return 0;
+  }
+
+  @Override
+  public String toString() {
+    return "ByRescoreComparator[rescorer:" + rescorer + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
new file mode 100644
index 0000000..57c5f3d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ByValueRecommendedItemComparator.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+import java.util.Comparator;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * Defines a natural ordering from most-preferred item (highest value) to least-preferred.
+ */
+public final class ByValueRecommendedItemComparator implements Comparator<RecommendedItem>, Serializable {
+
+  private static final Comparator<RecommendedItem> INSTANCE = new ByValueRecommendedItemComparator();
+
+  /** @return a shared, stateless instance */
+  public static Comparator<RecommendedItem> getInstance() {
+    return INSTANCE;
+  }
+
+  @Override
+  public int compare(RecommendedItem o1, RecommendedItem o2) {
+    float value1 = o1.getValue();
+    float value2 = o2.getValue();
+    // Descending by value; with primitive comparisons a NaN value compares as
+    // equal to everything.
+    if (value1 > value2) {
+      return -1;
+    }
+    if (value1 < value2) {
+      return 1;
+    }
+    return 0;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
new file mode 100644
index 0000000..7ed8cc3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/CachingRecommender.java
@@ -0,0 +1,251 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.model.PlusAnonymousUserDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A {@link Recommender} which caches the results from another {@link Recommender} in memory.
+ *
+ * TODO: Should be checked for thread safety
+ * </p>
+ */
+public final class CachingRecommender implements Recommender {
+
+ private static final Logger log = LoggerFactory.getLogger(CachingRecommender.class);
+
+ private final Recommender recommender;
+ private final int[] maxHowMany;
+ private final Retriever<Long,Recommendations> recommendationsRetriever;
+ private final Cache<Long,Recommendations> recommendationCache;
+ private final Cache<LongPair,Float> estimatedPrefCache;
+ private final RefreshHelper refreshHelper;
+ private IDRescorer currentRescorer;
+ private boolean currentlyIncludeKnownItems;
+
+ public CachingRecommender(Recommender recommender) throws TasteException {
+ Preconditions.checkArgument(recommender != null, "recommender is null");
+ this.recommender = recommender;
+ maxHowMany = new int[]{1};
+ // Use "num users" as an upper limit on cache size. Rough guess.
+ int numUsers = recommender.getDataModel().getNumUsers();
+ recommendationsRetriever = new RecommendationRetriever();
+ recommendationCache = new Cache<>(recommendationsRetriever, numUsers);
+ estimatedPrefCache = new Cache<>(new EstimatedPrefRetriever(), numUsers);
+ refreshHelper = new RefreshHelper(new Callable<Object>() {
+ @Override
+ public Object call() {
+ clear();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(recommender);
+ }
+
+ private void setCurrentRescorer(IDRescorer rescorer) {
+ if (rescorer == null) {
+ if (currentRescorer != null) {
+ currentRescorer = null;
+ clear();
+ }
+ } else {
+ if (!rescorer.equals(currentRescorer)) {
+ currentRescorer = rescorer;
+ clear();
+ }
+ }
+ }
+
+ public void setCurrentlyIncludeKnownItems(boolean currentlyIncludeKnownItems) {
+ this.currentlyIncludeKnownItems = currentlyIncludeKnownItems;
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return recommend(userID, howMany, null, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+ return recommend(userID, howMany, null, includeKnownItems);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany,IDRescorer rescorer) throws TasteException {
+ return recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany,IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+ synchronized (maxHowMany) {
+ if (howMany > maxHowMany[0]) {
+ maxHowMany[0] = howMany;
+ }
+ }
+
+ // Special case, avoid caching an anonymous user
+ if (userID == PlusAnonymousUserDataModel.TEMP_USER_ID) {
+ return recommendationsRetriever.get(PlusAnonymousUserDataModel.TEMP_USER_ID).getItems();
+ }
+
+ setCurrentRescorer(rescorer);
+ setCurrentlyIncludeKnownItems(includeKnownItems);
+
+ Recommendations recommendations = recommendationCache.get(userID);
+ if (recommendations.getItems().size() < howMany && !recommendations.isNoMoreRecommendableItems()) {
+ clear(userID);
+ recommendations = recommendationCache.get(userID);
+ if (recommendations.getItems().size() < howMany) {
+ recommendations.setNoMoreRecommendableItems(true);
+ }
+ }
+
+ List<RecommendedItem> recommendedItems = recommendations.getItems();
+ return recommendedItems.size() > howMany ? recommendedItems.subList(0, howMany) : recommendedItems;
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return estimatedPrefCache.get(new LongPair(userID, itemID));
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ recommender.setPreference(userID, itemID, value);
+ clear(userID);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ recommender.removePreference(userID, itemID);
+ clear(userID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return recommender.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ /**
+ * <p>
+ * Clears cached recommendations for the given user.
+ * </p>
+ *
+ * @param userID
+ * clear cached data associated with this user ID
+ */
+ public void clear(final long userID) {
+ log.debug("Clearing recommendations for user ID '{}'", userID);
+ recommendationCache.remove(userID);
+ estimatedPrefCache.removeKeysMatching(new Cache.MatchPredicate<LongPair>() {
+ @Override
+ public boolean matches(LongPair userItemPair) {
+ return userItemPair.getFirst() == userID;
+ }
+ });
+ }
+
+ /**
+ * <p>
+ * Clears all cached recommendations.
+ * </p>
+ */
+ public void clear() {
+ log.debug("Clearing all recommendations...");
+ recommendationCache.clear();
+ estimatedPrefCache.clear();
+ }
+
+ @Override
+ public String toString() {
+ return "CachingRecommender[recommender:" + recommender + ']';
+ }
+
+ private final class RecommendationRetriever implements Retriever<Long,Recommendations> {
+ @Override
+ public Recommendations get(Long key) throws TasteException {
+ log.debug("Retrieving new recommendations for user ID '{}'", key);
+ int howMany = maxHowMany[0];
+ IDRescorer rescorer = currentRescorer;
+ List<RecommendedItem> recommendations =
+ rescorer == null ? recommender.recommend(key, howMany, null, currentlyIncludeKnownItems) :
+ recommender.recommend(key, howMany, rescorer, currentlyIncludeKnownItems);
+ return new Recommendations(Collections.unmodifiableList(recommendations));
+ }
+ }
+
+ private final class EstimatedPrefRetriever implements Retriever<LongPair,Float> {
+ @Override
+ public Float get(LongPair key) throws TasteException {
+ long userID = key.getFirst();
+ long itemID = key.getSecond();
+ log.debug("Retrieving estimated preference for user ID '{}' and item ID '{}'", userID, itemID);
+ return recommender.estimatePreference(userID, itemID);
+ }
+ }
+
+ private static final class Recommendations {
+
+ private final List<RecommendedItem> items;
+ private boolean noMoreRecommendableItems;
+
+ private Recommendations(List<RecommendedItem> items) {
+ this.items = items;
+ }
+
+ List<RecommendedItem> getItems() {
+ return items;
+ }
+
+ boolean isNoMoreRecommendableItems() {
+ return noMoreRecommendableItems;
+ }
+
+ void setNoMoreRecommendableItems(boolean noMoreRecommendableItems) {
+ this.noMoreRecommendableItems = noMoreRecommendableItems;
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
new file mode 100644
index 0000000..f0f389f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/EstimatedPreferenceCapper.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Simple class which encapsulates restricting a preference value
+ * to a predefined range. The simple logic is wrapped up here for
+ * performance reasons.
+ */
+public final class EstimatedPreferenceCapper {
+
+ private final float min;
+ private final float max;
+
+ public EstimatedPreferenceCapper(DataModel model) {
+ min = model.getMinPreference();
+ max = model.getMaxPreference();
+ }
+
+ public float capEstimate(float estimate) {
+ if (estimate > max) {
+ estimate = max;
+ } else if (estimate < min) {
+ estimate = min;
+ }
+ return estimate;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
new file mode 100644
index 0000000..40e21a3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefItemBasedRecommender.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * A variant on {@link GenericItemBasedRecommender} which is appropriate for use when no notion of preference
+ * value exists in the data.
+ *
+ * @see org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender
+ */
+public final class GenericBooleanPrefItemBasedRecommender extends GenericItemBasedRecommender {
+
+ public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) {
+ super(dataModel, similarity);
+ }
+
+ public GenericBooleanPrefItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity,
+ CandidateItemsStrategy candidateItemsStrategy, MostSimilarItemsCandidateItemsStrategy
+ mostSimilarItemsCandidateItemsStrategy) {
+ super(dataModel, similarity, candidateItemsStrategy, mostSimilarItemsCandidateItemsStrategy);
+ }
+
+ /**
+ * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where
+ * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful
+ * however since it means results can't be ranked by preference value (all are 1). So instead this returns a
+ * sum of similarities.
+ */
+ @Override
+ protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID)
+ throws TasteException {
+ double[] similarities = getSimilarity().itemSimilarities(itemID, preferencesFromUser.getIDs());
+ boolean foundAPref = false;
+ double totalSimilarity = 0.0;
+ for (double theSimilarity : similarities) {
+ if (!Double.isNaN(theSimilarity)) {
+ foundAPref = true;
+ totalSimilarity += theSimilarity;
+ }
+ }
+ return foundAPref ? (float) totalSimilarity : Float.NaN;
+ }
+
+ @Override
+ public String toString() {
+ return "GenericBooleanPrefItemBasedRecommender";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
new file mode 100644
index 0000000..15fcc9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericBooleanPrefUserBasedRecommender.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * A variant on {@link GenericUserBasedRecommender} which is appropriate for use when no notion of preference
+ * value exists in the data.
+ */
+public final class GenericBooleanPrefUserBasedRecommender extends GenericUserBasedRecommender {
+
+ public GenericBooleanPrefUserBasedRecommender(DataModel dataModel,
+ UserNeighborhood neighborhood,
+ UserSimilarity similarity) {
+ super(dataModel, neighborhood, similarity);
+ }
+
+ /**
+ * This computation is in a technical sense, wrong, since in the domain of "boolean preference users" where
+ * all preference values are 1, this method should only ever return 1.0 or NaN. This isn't terribly useful
+ * however since it means results can't be ranked by preference value (all are 1). So instead this returns a
+ * sum of similarities to any other user in the neighborhood who has also rated the item.
+ */
+ @Override
+ protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException {
+ if (theNeighborhood.length == 0) {
+ return Float.NaN;
+ }
+ DataModel dataModel = getDataModel();
+ UserSimilarity similarity = getSimilarity();
+ float totalSimilarity = 0.0f;
+ boolean foundAPref = false;
+ for (long userID : theNeighborhood) {
+ // See GenericItemBasedRecommender.doEstimatePreference() too
+ if (userID != theUserID && dataModel.getPreferenceValue(userID, itemID) != null) {
+ foundAPref = true;
+ totalSimilarity += (float) similarity.userSimilarity(theUserID, userID);
+ }
+ }
+ return foundAPref ? totalSimilarity : Float.NaN;
+ }
+
+ @Override
+ protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID, boolean includeKnownItems)
+ throws TasteException {
+ DataModel dataModel = getDataModel();
+ FastIDSet possibleItemIDs = new FastIDSet();
+ for (long userID : theNeighborhood) {
+ possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID));
+ }
+ if (!includeKnownItems) {
+ possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID));
+ }
+ return possibleItemIDs;
+ }
+
+ @Override
+ public String toString() {
+ return "GenericBooleanPrefUserBasedRecommender";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
new file mode 100644
index 0000000..6dc8aa5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericItemBasedRecommender.java
@@ -0,0 +1,378 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender} which uses a given
+ * {@link org.apache.mahout.cf.taste.model.DataModel} and
+ * {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} to produce recommendations. This class
+ * represents Taste's support for item-based recommenders.
+ * </p>
+ *
+ * <p>
+ * The {@link org.apache.mahout.cf.taste.similarity.ItemSimilarity} is the most important point to discuss
+ * here. Item-based recommenders are useful because they can take advantage of something to be very fast: they
+ * base their computations on item similarity, not user similarity, and item similarity is relatively static.
+ * It can be precomputed, instead of re-computed in real time.
+ * </p>
+ *
+ * <p>
+ * Thus it's strongly recommended that you use
+ * {@link org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity} with pre-computed similarities if
+ * you're going to use this class. You can use
+ * {@link org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity} too, which computes
+ * similarities in real-time, but will probably find this painfully slow for large amounts of data.
+ * </p>
+ */
public class GenericItemBasedRecommender extends AbstractRecommender implements ItemBasedRecommender {

  private static final Logger log = LoggerFactory.getLogger(GenericItemBasedRecommender.class);

  private final ItemSimilarity similarity;
  private final MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy;
  private final RefreshHelper refreshHelper;
  // Not final: rebuilt by the refresh callback below; null when the model
  // declares neither a min nor a max preference (see buildCapper()).
  private EstimatedPreferenceCapper capper;

  private static final boolean EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT = true;

  /**
   * Creates a recommender over the given model and similarity. The two strategies
   * control which items are considered as recommendation candidates and as
   * most-similar-item candidates respectively; all arguments must be non-null.
   */
  public GenericItemBasedRecommender(DataModel dataModel,
                                     ItemSimilarity similarity,
                                     CandidateItemsStrategy candidateItemsStrategy,
                                     MostSimilarItemsCandidateItemsStrategy mostSimilarItemsCandidateItemsStrategy) {
    super(dataModel, candidateItemsStrategy);
    Preconditions.checkArgument(similarity != null, "similarity is null");
    this.similarity = similarity;
    Preconditions.checkArgument(mostSimilarItemsCandidateItemsStrategy != null,
        "mostSimilarItemsCandidateItemsStrategy is null");
    this.mostSimilarItemsCandidateItemsStrategy = mostSimilarItemsCandidateItemsStrategy;
    // When dependencies refresh, the preference range may change, so rebuild the capper.
    this.refreshHelper = new RefreshHelper(new Callable<Void>() {
      @Override
      public Void call() {
        capper = buildCapper();
        return null;
      }
    });
    refreshHelper.addDependency(dataModel);
    refreshHelper.addDependency(similarity);
    refreshHelper.addDependency(candidateItemsStrategy);
    refreshHelper.addDependency(mostSimilarItemsCandidateItemsStrategy);
    capper = buildCapper();
  }

  /** Convenience constructor using the default candidate-items strategies. */
  public GenericItemBasedRecommender(DataModel dataModel, ItemSimilarity similarity) {
    this(dataModel,
         similarity,
         AbstractRecommender.getDefaultCandidateItemsStrategy(),
         getDefaultMostSimilarItemsCandidateItemsStrategy());
  }

  protected static MostSimilarItemsCandidateItemsStrategy getDefaultMostSimilarItemsCandidateItemsStrategy() {
    return new PreferredItemsNeighborhoodCandidateItemsStrategy();
  }

  /** @return the {@link ItemSimilarity} this recommender was built with */
  public ItemSimilarity getSimilarity() {
    return similarity;
  }

  /**
   * Recommends up to {@code howMany} items for the user, optionally filtered and
   * re-scored by {@code rescorer} (may be null). Returns an empty list when the
   * user has no preferences at all.
   */
  @Override
  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
    throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
    log.debug("Recommending items for user ID '{}'", userID);

    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
    if (preferencesFromUser.length() == 0) {
      return Collections.emptyList();
    }

    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);

    TopItems.Estimator<Long> estimator = new Estimator(userID, preferencesFromUser);

    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
      estimator);

    log.debug("Recommendations are: {}", topItems);
    return topItems;
  }

  /**
   * Returns the user's actual preference for the item when one exists, otherwise
   * falls through to {@link #doEstimatePreference}.
   */
  @Override
  public float estimatePreference(long userID, long itemID) throws TasteException {
    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
    Float actualPref = getPreferenceForItem(preferencesFromUser, itemID);
    if (actualPref != null) {
      return actualPref;
    }
    return doEstimatePreference(userID, preferencesFromUser, itemID);
  }

  /** Linear scan for the user's stored preference for {@code itemID}; null if absent. */
  private static Float getPreferenceForItem(PreferenceArray preferencesFromUser, long itemID) {
    int size = preferencesFromUser.length();
    for (int i = 0; i < size; i++) {
      if (preferencesFromUser.getItemID(i) == itemID) {
        return preferencesFromUser.getValue(i);
      }
    }
    return null;
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany) throws TasteException {
    return mostSimilarItems(itemID, howMany, null);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany,
                                                Rescorer<LongPair> rescorer) throws TasteException {
    TopItems.Estimator<Long> estimator = new MostSimilarEstimator(itemID, similarity, rescorer);
    return doMostSimilarItems(new long[] {itemID}, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null,
        EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany,
                                                Rescorer<LongPair> rescorer) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer,
        EXCLUDE_ITEM_IF_NOT_SIMILAR_TO_ALL_BY_DEFAULT);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs,
                                                int howMany,
                                                boolean excludeItemIfNotSimilarToAll) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, null,
        excludeItemIfNotSimilarToAll);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany,
                                                Rescorer<LongPair> rescorer,
                                                boolean excludeItemIfNotSimilarToAll) throws TasteException {
    TopItems.Estimator<Long> estimator = new MultiMostSimilarEstimator(itemIDs, similarity, rescorer,
        excludeItemIfNotSimilarToAll);
    return doMostSimilarItems(itemIDs, howMany, estimator);
  }

  /**
   * Ranks the user's other preferred items by how much each one "explains" the
   * recommendation of {@code itemID} (see {@link RecommendedBecauseEstimator}).
   */
  @Override
  public List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws TasteException {
    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");

    DataModel model = getDataModel();
    TopItems.Estimator<Long> estimator = new RecommendedBecauseEstimator(userID, itemID);

    PreferenceArray prefs = model.getPreferencesFromUser(userID);
    int size = prefs.length();
    FastIDSet allUserItems = new FastIDSet(size);
    for (int i = 0; i < size; i++) {
      allUserItems.add(prefs.getItemID(i));
    }
    // The recommended item itself cannot be its own explanation.
    allUserItems.remove(itemID);

    return TopItems.getTopItems(howMany, allUserItems.iterator(), null, estimator);
  }

  /** Shared tail of all mostSimilarItems() overloads: pick candidates, then rank them. */
  private List<RecommendedItem> doMostSimilarItems(long[] itemIDs,
                                                   int howMany,
                                                   TopItems.Estimator<Long> estimator) throws TasteException {
    FastIDSet possibleItemIDs = mostSimilarItemsCandidateItemsStrategy.getCandidateItems(itemIDs, getDataModel());
    return TopItems.getTopItems(howMany, possibleItemIDs.iterator(), null, estimator);
  }

  /**
   * Weighted average of the user's preference values, weighted by similarity to
   * {@code itemID}. Returns NaN when based on fewer than two data points.
   */
  protected float doEstimatePreference(long userID, PreferenceArray preferencesFromUser, long itemID)
    throws TasteException {
    double preference = 0.0;
    double totalSimilarity = 0.0;
    int count = 0;
    double[] similarities = similarity.itemSimilarities(itemID, preferencesFromUser.getIDs());
    for (int i = 0; i < similarities.length; i++) {
      double theSimilarity = similarities[i];
      if (!Double.isNaN(theSimilarity)) {
        // Weights can be negative!
        preference += theSimilarity * preferencesFromUser.getValue(i);
        totalSimilarity += theSimilarity;
        count++;
      }
    }
    // Throw out the estimate if it was based on no data points, of course, but also if based on
    // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment.
    // The reason is that in this case the estimate is, simply, the user's rating for one item
    // that happened to have a defined similarity. The similarity score doesn't matter, and that
    // seems like a bad situation.
    if (count <= 1) {
      return Float.NaN;
    }
    float estimate = (float) (preference / totalSimilarity);
    if (capper != null) {
      estimate = capper.capEstimate(estimate);
    }
    return estimate;
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    refreshHelper.refresh(alreadyRefreshed);
  }

  @Override
  public String toString() {
    return "GenericItemBasedRecommender[similarity:" + similarity + ']';
  }

  /**
   * @return a capper for the model's preference range, or null when the model
   *         declares no range at all (both bounds NaN)
   */
  private EstimatedPreferenceCapper buildCapper() {
    DataModel dataModel = getDataModel();
    if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) {
      return null;
    } else {
      return new EstimatedPreferenceCapper(dataModel);
    }
  }

  /** Scores candidates by their (optionally rescored) similarity to one fixed item. */
  public static class MostSimilarEstimator implements TopItems.Estimator<Long> {

    private final long toItemID;
    private final ItemSimilarity similarity;
    private final Rescorer<LongPair> rescorer;

    public MostSimilarEstimator(long toItemID, ItemSimilarity similarity, Rescorer<LongPair> rescorer) {
      this.toItemID = toItemID;
      this.similarity = similarity;
      this.rescorer = rescorer;
    }

    @Override
    public double estimate(Long itemID) throws TasteException {
      LongPair pair = new LongPair(toItemID, itemID);
      // NaN drops filtered pairs out of the top-items ranking.
      if (rescorer != null && rescorer.isFiltered(pair)) {
        return Double.NaN;
      }
      double originalEstimate = similarity.itemSimilarity(toItemID, itemID);
      return rescorer == null ? originalEstimate : rescorer.rescore(pair, originalEstimate);
    }
  }

  /** Scores candidate items for one user via doEstimatePreference(). */
  private final class Estimator implements TopItems.Estimator<Long> {

    private final long userID;
    private final PreferenceArray preferencesFromUser;

    private Estimator(long userID, PreferenceArray preferencesFromUser) {
      this.userID = userID;
      this.preferencesFromUser = preferencesFromUser;
    }

    @Override
    public double estimate(Long itemID) throws TasteException {
      return doEstimatePreference(userID, preferencesFromUser, itemID);
    }
  }

  /** Scores candidates by their average similarity to a set of fixed items. */
  private static final class MultiMostSimilarEstimator implements TopItems.Estimator<Long> {

    private final long[] toItemIDs;
    private final ItemSimilarity similarity;
    private final Rescorer<LongPair> rescorer;
    private final boolean excludeItemIfNotSimilarToAll;

    private MultiMostSimilarEstimator(long[] toItemIDs, ItemSimilarity similarity, Rescorer<LongPair> rescorer,
        boolean excludeItemIfNotSimilarToAll) {
      this.toItemIDs = toItemIDs;
      this.similarity = similarity;
      this.rescorer = rescorer;
      this.excludeItemIfNotSimilarToAll = excludeItemIfNotSimilarToAll;
    }

    @Override
    public double estimate(Long itemID) throws TasteException {
      RunningAverage average = new FullRunningAverage();
      double[] similarities = similarity.itemSimilarities(itemID, toItemIDs);
      for (int i = 0; i < toItemIDs.length; i++) {
        long toItemID = toItemIDs[i];
        LongPair pair = new LongPair(toItemID, itemID);
        if (rescorer != null && rescorer.isFiltered(pair)) {
          continue;
        }
        double estimate = similarities[i];
        if (rescorer != null) {
          estimate = rescorer.rescore(pair, estimate);
        }
        // When excludeItemIfNotSimilarToAll is set, NaN estimates are added too —
        // presumably so one undefined similarity poisons the average and excludes
        // the item. NOTE(review): depends on FullRunningAverage propagating NaN; confirm.
        if (excludeItemIfNotSimilarToAll || !Double.isNaN(estimate)) {
          average.addDatum(estimate);
        }
      }
      double averageEstimate = average.getAverage();
      // NOTE(review): 0 appears to be treated as "no data" here; a genuine average
      // of exactly 0 would also be mapped to NaN.
      return averageEstimate == 0 ? Double.NaN : averageEstimate;
    }
  }

  /**
   * Scores how much each of the user's preferred items "explains" a recommendation:
   * (1 + similarity to the recommended item) * the user's preference for it.
   */
  private final class RecommendedBecauseEstimator implements TopItems.Estimator<Long> {

    private final long userID;
    private final long recommendedItemID;

    private RecommendedBecauseEstimator(long userID, long recommendedItemID) {
      this.userID = userID;
      this.recommendedItemID = recommendedItemID;
    }

    @Override
    public double estimate(Long itemID) throws TasteException {
      Float pref = getDataModel().getPreferenceValue(userID, itemID);
      if (pref == null) {
        return Float.NaN;
      }
      double similarityValue = similarity.itemSimilarity(recommendedItemID, itemID);
      return (1.0 + similarityValue) * pref;
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
new file mode 100644
index 0000000..8c8f6ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericRecommendedItem.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple implementation of {@link RecommendedItem}.
+ * </p>
+ */
+public final class GenericRecommendedItem implements RecommendedItem, Serializable {
+
+ private final long itemID;
+ private final float value;
+
+ /**
+ * @throws IllegalArgumentException
+ * if item is null or value is NaN
+ */
+ public GenericRecommendedItem(long itemID, float value) {
+ Preconditions.checkArgument(!Float.isNaN(value), "value is NaN");
+ this.itemID = itemID;
+ this.value = value;
+ }
+
+ @Override
+ public long getItemID() {
+ return itemID;
+ }
+
+ @Override
+ public float getValue() {
+ return value;
+ }
+
+ @Override
+ public String toString() {
+ return "RecommendedItem[item:" + itemID + ", value:" + value + ']';
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) itemID ^ RandomUtils.hashFloat(value);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof GenericRecommendedItem)) {
+ return false;
+ }
+ RecommendedItem other = (RecommendedItem) o;
+ return itemID == other.getItemID() && value == other.getValue();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
new file mode 100644
index 0000000..1e2ef73
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/GenericUserBasedRecommender.java
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.cf.taste.recommender.UserBasedRecommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.LongPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link org.apache.mahout.cf.taste.recommender.Recommender}
+ * which uses a given {@link DataModel} and {@link UserNeighborhood} to produce recommendations.
+ * </p>
+ */
+public class GenericUserBasedRecommender extends AbstractRecommender implements UserBasedRecommender {
+
+ private static final Logger log = LoggerFactory.getLogger(GenericUserBasedRecommender.class);
+
+ // Supplies each user's set of nearest-neighbor user IDs.
+ private final UserNeighborhood neighborhood;
+ // Pairwise user similarity used to weight neighbors' preferences.
+ private final UserSimilarity similarity;
+ // Coordinates refresh of this recommender together with its dependencies.
+ private final RefreshHelper refreshHelper;
+ // Clamps estimates into the model's preference range; null when the model defines no min/max.
+ private EstimatedPreferenceCapper capper;
+
+ public GenericUserBasedRecommender(DataModel dataModel,
+ UserNeighborhood neighborhood,
+ UserSimilarity similarity) {
+ super(dataModel);
+ Preconditions.checkArgument(neighborhood != null, "neighborhood is null");
+ // NOTE(review): similarity is not null-checked, unlike neighborhood — confirm callers always supply one.
+ this.neighborhood = neighborhood;
+ this.similarity = similarity;
+ // On refresh, rebuild the capper since the model's min/max preference bounds may have changed.
+ this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ capper = buildCapper();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(dataModel);
+ refreshHelper.addDependency(similarity);
+ refreshHelper.addDependency(neighborhood);
+ capper = buildCapper();
+ }
+
+ public UserSimilarity getSimilarity() {
+ return similarity;
+ }
+
+ /**
+ * Recommends up to {@code howMany} items by estimating preferences over all items seen in the
+ * user's neighborhood (optionally excluding items the user already knows).
+ */
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+
+ log.debug("Recommending items for user ID '{}'", userID);
+
+ long[] theNeighborhood = neighborhood.getUserNeighborhood(userID);
+
+ // No neighbors means no data to estimate from; return nothing rather than guessing.
+ if (theNeighborhood.length == 0) {
+ return Collections.emptyList();
+ }
+
+ FastIDSet allItemIDs = getAllOtherItems(theNeighborhood, userID, includeKnownItems);
+
+ TopItems.Estimator<Long> estimator = new Estimator(userID, theNeighborhood);
+
+ List<RecommendedItem> topItems = TopItems
+ .getTopItems(howMany, allItemIDs.iterator(), rescorer, estimator);
+
+ log.debug("Recommendations are: {}", topItems);
+ return topItems;
+ }
+
+ /**
+ * Returns the user's actual preference when one exists; otherwise estimates it from the
+ * user's neighborhood.
+ */
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ DataModel model = getDataModel();
+ Float actualPref = model.getPreferenceValue(userID, itemID);
+ if (actualPref != null) {
+ return actualPref;
+ }
+ long[] theNeighborhood = neighborhood.getUserNeighborhood(userID);
+ return doEstimatePreference(userID, theNeighborhood, itemID);
+ }
+
+ @Override
+ public long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException {
+ return mostSimilarUserIDs(userID, howMany, null);
+ }
+
+ @Override
+ public long[] mostSimilarUserIDs(long userID, int howMany, Rescorer<LongPair> rescorer) throws TasteException {
+ TopItems.Estimator<Long> estimator = new MostSimilarEstimator(userID, similarity, rescorer);
+ return doMostSimilarUsers(howMany, estimator);
+ }
+
+ private long[] doMostSimilarUsers(int howMany, TopItems.Estimator<Long> estimator) throws TasteException {
+ DataModel model = getDataModel();
+ return TopItems.getTopUsers(howMany, model.getUserIDs(), null, estimator);
+ }
+
+ /**
+ * Estimates a preference as the similarity-weighted average of the neighbors' preferences
+ * for the item. Returns NaN when fewer than two neighbors contribute data.
+ */
+ protected float doEstimatePreference(long theUserID, long[] theNeighborhood, long itemID) throws TasteException {
+ if (theNeighborhood.length == 0) {
+ return Float.NaN;
+ }
+ DataModel dataModel = getDataModel();
+ double preference = 0.0;
+ double totalSimilarity = 0.0;
+ int count = 0;
+ for (long userID : theNeighborhood) {
+ if (userID != theUserID) {
+ // See GenericItemBasedRecommender.doEstimatePreference() too
+ Float pref = dataModel.getPreferenceValue(userID, itemID);
+ if (pref != null) {
+ double theSimilarity = similarity.userSimilarity(theUserID, userID);
+ if (!Double.isNaN(theSimilarity)) {
+ preference += theSimilarity * pref;
+ totalSimilarity += theSimilarity;
+ count++;
+ }
+ }
+ }
+ }
+ // Throw out the estimate if it was based on no data points, of course, but also if based on
+ // just one. This is a bit of a band-aid on the 'stock' item-based algorithm for the moment.
+ // The reason is that in this case the estimate is, simply, the user's rating for one item
+ // that happened to have a defined similarity. The similarity score doesn't matter, and that
+ // seems like a bad situation.
+ if (count <= 1) {
+ return Float.NaN;
+ }
+ float estimate = (float) (preference / totalSimilarity);
+ if (capper != null) {
+ estimate = capper.capEstimate(estimate);
+ }
+ return estimate;
+ }
+
+ /**
+ * Collects all item IDs preferred by anyone in the neighborhood, minus the target user's
+ * own items unless includeKnownItems is set.
+ */
+ protected FastIDSet getAllOtherItems(long[] theNeighborhood, long theUserID, boolean includeKnownItems)
+ throws TasteException {
+ DataModel dataModel = getDataModel();
+ FastIDSet possibleItemIDs = new FastIDSet();
+ for (long userID : theNeighborhood) {
+ possibleItemIDs.addAll(dataModel.getItemIDsFromUser(userID));
+ }
+ if (!includeKnownItems) {
+ possibleItemIDs.removeAll(dataModel.getItemIDsFromUser(theUserID));
+ }
+ return possibleItemIDs;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "GenericUserBasedRecommender[neighborhood:" + neighborhood + ']';
+ }
+
+ // Returns null when the model declares neither a min nor a max preference, i.e. no capping needed.
+ private EstimatedPreferenceCapper buildCapper() {
+ DataModel dataModel = getDataModel();
+ if (Float.isNaN(dataModel.getMinPreference()) && Float.isNaN(dataModel.getMaxPreference())) {
+ return null;
+ } else {
+ return new EstimatedPreferenceCapper(dataModel);
+ }
+ }
+
+ // Scores candidate users by their similarity to a fixed target user, with optional rescoring.
+ private static final class MostSimilarEstimator implements TopItems.Estimator<Long> {
+
+ private final long toUserID;
+ private final UserSimilarity similarity;
+ private final Rescorer<LongPair> rescorer;
+
+ private MostSimilarEstimator(long toUserID, UserSimilarity similarity, Rescorer<LongPair> rescorer) {
+ this.toUserID = toUserID;
+ this.similarity = similarity;
+ this.rescorer = rescorer;
+ }
+
+ @Override
+ public double estimate(Long userID) throws TasteException {
+ // Don't consider the user itself as a possible most similar user
+ if (userID == toUserID) {
+ return Double.NaN;
+ }
+ if (rescorer == null) {
+ return similarity.userSimilarity(toUserID, userID);
+ } else {
+ LongPair pair = new LongPair(toUserID, userID);
+ if (rescorer.isFiltered(pair)) {
+ return Double.NaN;
+ }
+ double originalEstimate = similarity.userSimilarity(toUserID, userID);
+ return rescorer.rescore(pair, originalEstimate);
+ }
+ }
+ }
+
+ // Scores candidate items for one user against a fixed, precomputed neighborhood.
+ private final class Estimator implements TopItems.Estimator<Long> {
+
+ private final long theUserID;
+ private final long[] theNeighborhood;
+
+ Estimator(long theUserID, long[] theNeighborhood) {
+ this.theUserID = theUserID;
+ this.theNeighborhood = theNeighborhood;
+ }
+
+ @Override
+ public double estimate(Long itemID) throws TasteException {
+ return doEstimatePreference(theUserID, theNeighborhood, itemID);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
new file mode 100644
index 0000000..618c65f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemAverageRecommender.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple recommender that always estimates preference for an item to be the average of all known preference
+ * values for that item. No information about users is taken into account. This implementation is provided for
+ * experimentation; while simple and fast, it may not produce very good recommendations.
+ * </p>
+ */
+public final class ItemAverageRecommender extends AbstractRecommender {
+
+ private static final Logger log = LoggerFactory.getLogger(ItemAverageRecommender.class);
+
+ private final FastByIDMap<RunningAverage> itemAverages;
+ private final ReadWriteLock buildAveragesLock;
+ private final RefreshHelper refreshHelper;
+
+ public ItemAverageRecommender(DataModel dataModel) throws TasteException {
+ super(dataModel);
+ this.itemAverages = new FastByIDMap<>();
+ this.buildAveragesLock = new ReentrantReadWriteLock();
+ this.refreshHelper = new RefreshHelper(new Callable<Object>() {
+ @Override
+ public Object call() throws TasteException {
+ buildAverageDiffs();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(dataModel);
+ buildAverageDiffs();
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+ log.debug("Recommending items for user ID '{}'", userID);
+
+ PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
+ FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);
+
+ TopItems.Estimator<Long> estimator = new Estimator();
+
+ List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
+ estimator);
+
+ log.debug("Recommendations are: {}", topItems);
+ return topItems;
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ DataModel dataModel = getDataModel();
+ Float actualPref = dataModel.getPreferenceValue(userID, itemID);
+ if (actualPref != null) {
+ return actualPref;
+ }
+ return doEstimatePreference(itemID);
+ }
+
+ private float doEstimatePreference(long itemID) {
+ buildAveragesLock.readLock().lock();
+ try {
+ RunningAverage average = itemAverages.get(itemID);
+ return average == null ? Float.NaN : (float) average.getAverage();
+ } finally {
+ buildAveragesLock.readLock().unlock();
+ }
+ }
+
+ private void buildAverageDiffs() throws TasteException {
+ try {
+ buildAveragesLock.writeLock().lock();
+ DataModel dataModel = getDataModel();
+ LongPrimitiveIterator it = dataModel.getUserIDs();
+ while (it.hasNext()) {
+ PreferenceArray prefs = dataModel.getPreferencesFromUser(it.nextLong());
+ int size = prefs.length();
+ for (int i = 0; i < size; i++) {
+ long itemID = prefs.getItemID(i);
+ RunningAverage average = itemAverages.get(itemID);
+ if (average == null) {
+ average = new FullRunningAverage();
+ itemAverages.put(itemID, average);
+ }
+ average.addDatum(prefs.getValue(i));
+ }
+ }
+ } finally {
+ buildAveragesLock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ DataModel dataModel = getDataModel();
+ double prefDelta;
+ try {
+ Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+ prefDelta = oldPref == null ? value : value - oldPref;
+ } catch (NoSuchUserException nsee) {
+ prefDelta = value;
+ }
+ super.setPreference(userID, itemID, value);
+ try {
+ buildAveragesLock.writeLock().lock();
+ RunningAverage average = itemAverages.get(itemID);
+ if (average == null) {
+ RunningAverage newAverage = new FullRunningAverage();
+ newAverage.addDatum(prefDelta);
+ itemAverages.put(itemID, newAverage);
+ } else {
+ average.changeDatum(prefDelta);
+ }
+ } finally {
+ buildAveragesLock.writeLock().unlock();
+ }
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ DataModel dataModel = getDataModel();
+ Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+ super.removePreference(userID, itemID);
+ if (oldPref != null) {
+ try {
+ buildAveragesLock.writeLock().lock();
+ RunningAverage average = itemAverages.get(itemID);
+ if (average == null) {
+ throw new IllegalStateException("No preferences exist for item ID: " + itemID);
+ } else {
+ average.removeDatum(oldPref);
+ }
+ } finally {
+ buildAveragesLock.writeLock().unlock();
+ }
+ }
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "ItemAverageRecommender";
+ }
+
+ private final class Estimator implements TopItems.Estimator<Long> {
+
+ @Override
+ public double estimate(Long itemID) {
+ return doEstimatePreference(itemID);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
new file mode 100644
index 0000000..b2bcd24
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/ItemUserAverageRecommender.java
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Like {@link ItemAverageRecommender}, except that estimated preferences are adjusted for the users' average
+ * preference value. For example, say user X has not rated item Y. Item Y's average preference value is 3.5.
+ * User X's average preference value is 4.2, and the average over all preference values is 4.0. User X prefers
+ * items 0.2 higher on average, so, the estimated preference for user X, item Y is 3.5 + 0.2 = 3.7.
+ * </p>
+ */
+public final class ItemUserAverageRecommender extends AbstractRecommender {
+
+  private static final Logger log = LoggerFactory.getLogger(ItemUserAverageRecommender.class);
+
+  // Per-item and per-user running mean preference values, plus the global mean;
+  // all guarded by buildAveragesLock.
+  private final FastByIDMap<RunningAverage> itemAverages;
+  private final FastByIDMap<RunningAverage> userAverages;
+  private final RunningAverage overallAveragePrefValue;
+  private final ReadWriteLock buildAveragesLock;
+  private final RefreshHelper refreshHelper;
+
+  public ItemUserAverageRecommender(DataModel dataModel) throws TasteException {
+    super(dataModel);
+    this.itemAverages = new FastByIDMap<>();
+    this.userAverages = new FastByIDMap<>();
+    this.overallAveragePrefValue = new FullRunningAverage();
+    this.buildAveragesLock = new ReentrantReadWriteLock();
+    // Rebuild all averages whenever the underlying data model refreshes.
+    this.refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        buildAverageDiffs();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+    buildAverageDiffs();
+  }
+
+  /**
+   * Recommends items by item average adjusted for the user's bias relative to the overall
+   * average (see class javadoc for the formula).
+   *
+   * @throws IllegalArgumentException if howMany is less than 1
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+      throws TasteException {
+    Preconditions.checkArgument(howMany >= 1, "howMany must be at least 1");
+    log.debug("Recommending items for user ID '{}'", userID);
+
+    PreferenceArray preferencesFromUser = getDataModel().getPreferencesFromUser(userID);
+    FastIDSet possibleItemIDs = getAllOtherItems(userID, preferencesFromUser, includeKnownItems);
+
+    TopItems.Estimator<Long> estimator = new Estimator(userID);
+
+    List<RecommendedItem> topItems = TopItems.getTopItems(howMany, possibleItemIDs.iterator(), rescorer,
+        estimator);
+
+    log.debug("Recommendations are: {}", topItems);
+    return topItems;
+  }
+
+  /**
+   * Returns the user's actual preference when one exists; otherwise the bias-adjusted
+   * item average, or NaN when either the item or the user is unknown.
+   */
+  @Override
+  public float estimatePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float actualPref = dataModel.getPreferenceValue(userID, itemID);
+    if (actualPref != null) {
+      return actualPref;
+    }
+    return doEstimatePreference(userID, itemID);
+  }
+
+  private float doEstimatePreference(long userID, long itemID) {
+    // Acquire before try: if lock() itself failed inside the try, the finally would
+    // call unlock() on a lock we never held and mask the original error.
+    buildAveragesLock.readLock().lock();
+    try {
+      RunningAverage itemAverage = itemAverages.get(itemID);
+      if (itemAverage == null) {
+        return Float.NaN;
+      }
+      RunningAverage userAverage = userAverages.get(userID);
+      if (userAverage == null) {
+        return Float.NaN;
+      }
+      // userDiff captures how much higher/lower this user rates than average overall.
+      double userDiff = userAverage.getAverage() - overallAveragePrefValue.getAverage();
+      return (float) (itemAverage.getAverage() + userDiff);
+    } finally {
+      buildAveragesLock.readLock().unlock();
+    }
+  }
+
+  // Recomputes item, user, and overall averages from scratch over all preferences.
+  private void buildAverageDiffs() throws TasteException {
+    // Lock before try (not inside it) so a failed lock() cannot trigger a spurious unlock().
+    buildAveragesLock.writeLock().lock();
+    try {
+      DataModel dataModel = getDataModel();
+      LongPrimitiveIterator it = dataModel.getUserIDs();
+      while (it.hasNext()) {
+        long userID = it.nextLong();
+        PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+        int size = prefs.length();
+        for (int i = 0; i < size; i++) {
+          long itemID = prefs.getItemID(i);
+          float value = prefs.getValue(i);
+          addDatumAndCreateIfNeeded(itemID, value, itemAverages);
+          addDatumAndCreateIfNeeded(userID, value, userAverages);
+          overallAveragePrefValue.addDatum(value);
+        }
+      }
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+
+  // Adds value to the running average keyed by itemID, creating the average on first use.
+  private static void addDatumAndCreateIfNeeded(long itemID, float value, FastByIDMap<RunningAverage> averages) {
+    RunningAverage itemAverage = averages.get(itemID);
+    if (itemAverage == null) {
+      itemAverage = new FullRunningAverage();
+      averages.put(itemID, itemAverage);
+    }
+    itemAverage.addDatum(value);
+  }
+
+  /**
+   * Records the preference and incrementally adjusts the item, user, and overall averages
+   * by the delta between the new and any previous value.
+   */
+  @Override
+  public void setPreference(long userID, long itemID, float value) throws TasteException {
+    DataModel dataModel = getDataModel();
+    double prefDelta;
+    try {
+      Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+      prefDelta = oldPref == null ? value : value - oldPref;
+    } catch (NoSuchUserException nsee) {
+      // Unknown user: treat the whole value as new data.
+      prefDelta = value;
+    }
+    super.setPreference(userID, itemID, value);
+    // Lock before try (not inside it) so a failed lock() cannot trigger a spurious unlock().
+    buildAveragesLock.writeLock().lock();
+    try {
+      RunningAverage itemAverage = itemAverages.get(itemID);
+      if (itemAverage == null) {
+        RunningAverage newItemAverage = new FullRunningAverage();
+        newItemAverage.addDatum(prefDelta);
+        itemAverages.put(itemID, newItemAverage);
+      } else {
+        itemAverage.changeDatum(prefDelta);
+      }
+      RunningAverage userAverage = userAverages.get(userID);
+      if (userAverage == null) {
+        RunningAverage newUserAverage = new FullRunningAverage();
+        newUserAverage.addDatum(prefDelta);
+        userAverages.put(userID, newUserAverage);
+      } else {
+        userAverage.changeDatum(prefDelta);
+      }
+      overallAveragePrefValue.changeDatum(prefDelta);
+    } finally {
+      buildAveragesLock.writeLock().unlock();
+    }
+  }
+
+  /**
+   * Removes the preference and backs its value out of the item, user, and overall averages.
+   *
+   * @throws IllegalStateException if no average exists for the item or user
+   */
+  @Override
+  public void removePreference(long userID, long itemID) throws TasteException {
+    DataModel dataModel = getDataModel();
+    Float oldPref = dataModel.getPreferenceValue(userID, itemID);
+    super.removePreference(userID, itemID);
+    if (oldPref != null) {
+      // Lock before try (not inside it) so a failed lock() cannot trigger a spurious unlock().
+      buildAveragesLock.writeLock().lock();
+      try {
+        RunningAverage itemAverage = itemAverages.get(itemID);
+        if (itemAverage == null) {
+          throw new IllegalStateException("No preferences exist for item ID: " + itemID);
+        }
+        itemAverage.removeDatum(oldPref);
+        RunningAverage userAverage = userAverages.get(userID);
+        if (userAverage == null) {
+          throw new IllegalStateException("No preferences exist for user ID: " + userID);
+        }
+        userAverage.removeDatum(oldPref);
+        overallAveragePrefValue.removeDatum(oldPref);
+      } finally {
+        buildAveragesLock.writeLock().unlock();
+      }
+    }
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+
+  @Override
+  public String toString() {
+    return "ItemUserAverageRecommender";
+  }
+
+  // Estimates candidate items for one fixed user via the bias-adjusted item average.
+  private final class Estimator implements TopItems.Estimator<Long> {
+
+    private final long userID;
+
+    private Estimator(long userID) {
+      this.userID = userID;
+    }
+
+    @Override
+    public double estimate(Long itemID) {
+      return doEstimatePreference(userID, itemID);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
new file mode 100644
index 0000000..e0eda7a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/NullRescorer.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.Rescorer;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * A simple {@link Rescorer} which always returns the original score.
+ * </p>
+ */
+public final class NullRescorer<T> implements Rescorer<T>, IDRescorer {
+
+  // One stateless instance serves all uses; the typed accessors below just narrow the view.
+  private static final IDRescorer USER_OR_ITEM_INSTANCE = new NullRescorer<>();
+  private static final Rescorer<LongPair> ITEM_ITEM_PAIR_INSTANCE = new NullRescorer<>();
+  private static final Rescorer<LongPair> USER_USER_PAIR_INSTANCE = new NullRescorer<>();
+
+  /** Not instantiable externally; use the singleton accessors. */
+  private NullRescorer() {
+  }
+
+  /** @return a no-op rescorer for item IDs */
+  public static IDRescorer getItemInstance() {
+    return USER_OR_ITEM_INSTANCE;
+  }
+
+  /** @return a no-op rescorer for user IDs */
+  public static IDRescorer getUserInstance() {
+    return USER_OR_ITEM_INSTANCE;
+  }
+
+  /** @return a no-op rescorer for item-item pairs */
+  public static Rescorer<LongPair> getItemItemPairInstance() {
+    return ITEM_ITEM_PAIR_INSTANCE;
+  }
+
+  /** @return a no-op rescorer for user-user pairs */
+  public static Rescorer<LongPair> getUserUserPairInstance() {
+    return USER_USER_PAIR_INSTANCE;
+  }
+
+  /**
+   * @param thing
+   *          to rescore
+   * @param originalScore
+   *          current score for item
+   * @return same originalScore as new score, always
+   */
+  @Override
+  public double rescore(T thing, double originalScore) {
+    return originalScore;
+  }
+
+  /** Never filters anything out. */
+  @Override
+  public boolean isFiltered(T thing) {
+    return false;
+  }
+
+  /** @return originalScore unchanged, always */
+  @Override
+  public double rescore(long id, double originalScore) {
+    return originalScore;
+  }
+
+  /** Never filters anything out. */
+  @Override
+  public boolean isFiltered(long id) {
+    return false;
+  }
+
+  @Override
+  public String toString() {
+    return "NullRescorer";
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
new file mode 100644
index 0000000..6297d0b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/PreferredItemsNeighborhoodCandidateItemsStrategy.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+public final class PreferredItemsNeighborhoodCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  /**
+   * Returns all items that have not been rated by the user and that were preferred by another
+   * user that has preferred at least one item that the current user has preferred too.
+   */
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+      throws TasteException {
+    FastIDSet candidateItemIDs = new FastIDSet();
+    for (long preferredItemID : preferredItemIDs) {
+      // Every user who also preferred this item contributes their whole item set.
+      PreferenceArray prefsForItem = dataModel.getPreferencesForItem(preferredItemID);
+      int numPrefs = prefsForItem.length();
+      for (int i = 0; i < numPrefs; i++) {
+        long otherUserID = prefsForItem.getUserID(i);
+        candidateItemIDs.addAll(dataModel.getItemIDsFromUser(otherUserID));
+      }
+    }
+    if (!includeKnownItems) {
+      // Drop the items the target user already knows.
+      candidateItemIDs.removeAll(preferredItemIDs);
+    }
+    return candidateItemIDs;
+  }
+
+}
r***@apache.org
2018-06-28 14:54:47 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
new file mode 100644
index 0000000..08aa5ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/RandomRecommender.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Produces random recommendations and preference estimates. This is likely only useful as a novelty and for
+ * benchmarking.
+ */
+public final class RandomRecommender extends AbstractRecommender {
+
+  private final Random random = RandomUtils.getRandom();
+  // Bounds of all preference values observed in the data model at construction time;
+  // every random estimate is drawn uniformly from [minPref, maxPref].
+  private final float minPref;
+  private final float maxPref;
+
+  /**
+   * Scans every preference in {@code dataModel} once to learn the minimum and maximum
+   * preference values, which bound all subsequently generated random estimates.
+   *
+   * @throws TasteException if the underlying data model cannot be accessed
+   */
+  public RandomRecommender(DataModel dataModel) throws TasteException {
+    super(dataModel);
+    // Locals intentionally shadow the final fields; the fields are assigned once at the end.
+    float maxPref = Float.NEGATIVE_INFINITY;
+    float minPref = Float.POSITIVE_INFINITY;
+    LongPrimitiveIterator userIterator = dataModel.getUserIDs();
+    while (userIterator.hasNext()) {
+      long userID = userIterator.next();
+      PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+      for (int i = 0; i < prefs.length(); i++) {
+        float prefValue = prefs.getValue(i);
+        if (prefValue < minPref) {
+          minPref = prefValue;
+        }
+        if (prefValue > maxPref) {
+          maxPref = prefValue;
+        }
+      }
+    }
+    this.minPref = minPref;
+    this.maxPref = maxPref;
+  }
+
+  /**
+   * Picks items at random (via a random skip into the item-ID iterator) until {@code howMany}
+   * have been collected, each paired with a random preference estimate. The {@code rescorer}
+   * parameter is ignored by this implementation, and the same item may be drawn and added
+   * more than once (the result is a plain list, not de-duplicated).
+   *
+   * NOTE(review): if fewer than {@code howMany} eligible items exist — e.g. the user has rated
+   * nearly every item and {@code includeKnownItems} is false — this loop never terminates;
+   * confirm callers bound {@code howMany} appropriately.
+   */
+  @Override
+  public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+    throws TasteException {
+    DataModel dataModel = getDataModel();
+    int numItems = dataModel.getNumItems();
+    List<RecommendedItem> result = new ArrayList<>(howMany);
+    while (result.size() < howMany) {
+      // Fresh iterator per draw; skip a random distance to select a pseudo-random item.
+      LongPrimitiveIterator it = dataModel.getItemIDs();
+      it.skip(random.nextInt(numItems));
+      long itemID = it.next();
+      // A null preference value means the user has not rated this item.
+      if (includeKnownItems || dataModel.getPreferenceValue(userID, itemID) == null) {
+        result.add(new GenericRecommendedItem(itemID, randomPref()));
+      }
+    }
+    return result;
+  }
+
+  /** Returns a uniformly random value in the observed preference range, ignoring both IDs. */
+  @Override
+  public float estimatePreference(long userID, long itemID) {
+    return randomPref();
+  }
+
+  // Uniform draw from [minPref, maxPref].
+  private float randomPref() {
+    return minPref + random.nextFloat() * (maxPref - minPref);
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    // No state of its own to refresh beyond the min/max bounds captured at construction;
+    // simply delegates to the data model.
+    getDataModel().refresh(alreadyRefreshed);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
new file mode 100644
index 0000000..623a60b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FixedSizeSamplingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Iterator;
+
+/**
+ * <p>Returns all items that have not been rated by the user <em>(3)</em> and that were preferred by another user
+ * <em>(2)</em> that has preferred at least one item <em>(1)</em> that the current user has preferred too.</p>
+ *
+ * <p>This strategy uses sampling to limit the number of items that are considered, by sampling three different
+ * things, noted above:</p>
+ *
+ * <ol>
+ * <li>The items that the user has preferred</li>
+ * <li>The users who also prefer each of those items</li>
+ * <li>The items those users also prefer</li>
+ * </ol>
+ *
+ * <p>There is a maximum associated with each of these three things; if the number of items or users exceeds
+ * that max, it is sampled so that the expected number of items or users actually used in that part of the
+ * computation is equal to the max.</p>
+ *
+ * <p>Three arguments control these three maxima. Each is a "factor" f, which establishes the max at
+ * f * log2(n), where n is the number of users or items in the data. For example if factor #2 is 5,
+ * which controls the number of users sampled per item, then 5 * log2(# users) is the maximum for this
+ * part of the computation.</p>
+ *
+ * <p>Each can be set to not do any limiting with value {@link #NO_LIMIT_FACTOR}.</p>
+ */
+public class SamplingCandidateItemsStrategy extends AbstractCandidateItemsStrategy {
+
+  private static final Logger log = LoggerFactory.getLogger(SamplingCandidateItemsStrategy.class);
+
+  /**
+   * Default factor used if not otherwise specified, for all limits. (30).
+   */
+  public static final int DEFAULT_FACTOR = 30;
+  /**
+   * Specify this value as a factor to mean no limit.
+   */
+  public static final int NO_LIMIT_FACTOR = Integer.MAX_VALUE;
+  private static final int MAX_LIMIT = Integer.MAX_VALUE;
+  private static final double LOG2 = Math.log(2.0);
+
+  /** max number of the user's preferred items considered */
+  private final int maxItems;
+  /** max number of users considered per preferred item */
+  private final int maxUsersPerItem;
+  /** max number of candidate items taken from each of those users */
+  private final int maxItemsPerUser;
+
+  /**
+   * Uses {@link #DEFAULT_FACTOR} for all three factors.
+   *
+   * @see #SamplingCandidateItemsStrategy(int, int, int, int, int)
+   */
+  public SamplingCandidateItemsStrategy(int numUsers, int numItems) {
+    this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems);
+  }
+
+  /**
+   * @param itemsFactor factor controlling max items considered for a user
+   * @param usersPerItemFactor factor controlling max users considered for each of those items
+   * @param candidatesPerUserFactor factor controlling max candidate items considered from each of those users
+   * @param numUsers number of users currently in the data
+   * @param numItems number of items in the data
+   */
+  public SamplingCandidateItemsStrategy(int itemsFactor,
+                                        int usersPerItemFactor,
+                                        int candidatesPerUserFactor,
+                                        int numUsers,
+                                        int numItems) {
+    Preconditions.checkArgument(itemsFactor > 0, "itemsFactor must be greater than 0!");
+    Preconditions.checkArgument(usersPerItemFactor > 0, "usersPerItemFactor must be greater than 0!");
+    Preconditions.checkArgument(candidatesPerUserFactor > 0, "candidatesPerUserFactor must be greater than 0!");
+    Preconditions.checkArgument(numUsers > 0, "numUsers must be greater than 0!");
+    Preconditions.checkArgument(numItems > 0, "numItems must be greater than 0!");
+    maxItems = computeMaxFrom(itemsFactor, numItems);
+    maxUsersPerItem = computeMaxFrom(usersPerItemFactor, numUsers);
+    maxItemsPerUser = computeMaxFrom(candidatesPerUserFactor, numItems);
+    log.debug("maxItems {}, maxUsersPerItem {}, maxItemsPerUser {}", maxItems, maxUsersPerItem, maxItemsPerUser);
+  }
+
+  // Computes factor * (1 + log2(numThings)), clamped to MAX_LIMIT; the long intermediate
+  // guards against int overflow before the clamp.
+  private static int computeMaxFrom(int factor, int numThings) {
+    if (factor == NO_LIMIT_FACTOR) {
+      return MAX_LIMIT;
+    }
+    long max = (long) (factor * (1.0 + Math.log(numThings) / LOG2));
+    return max > MAX_LIMIT ? MAX_LIMIT : (int) max;
+  }
+
+  /**
+   * Collects candidate items as in the unsampled strategy, but samples at each of the three
+   * stages (the user's items, the users per item, and the items per co-preferring user) so the
+   * expected amount of work stays within the configured maxima.
+   */
+  @Override
+  protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, boolean includeKnownItems)
+    throws TasteException {
+    LongPrimitiveIterator preferredItemIDsIterator = new LongPrimitiveArrayIterator(preferredItemIDs);
+    if (preferredItemIDs.length > maxItems) {
+      // Stage 1: sample the user's own preferred items down to ~maxItems in expectation.
+      double samplingRate = (double) maxItems / preferredItemIDs.length;
+      preferredItemIDsIterator =
+          new SamplingLongPrimitiveIterator(preferredItemIDsIterator, samplingRate);
+    }
+    FastIDSet possibleItemsIDs = new FastIDSet();
+    while (preferredItemIDsIterator.hasNext()) {
+      long itemID = preferredItemIDsIterator.nextLong();
+      PreferenceArray prefs = dataModel.getPreferencesForItem(itemID);
+      int prefsLength = prefs.length();
+      if (prefsLength > maxUsersPerItem) {
+        // Stage 2: sample exactly maxUsersPerItem of the users who preferred this item.
+        Iterator<Preference> sampledPrefs =
+            new FixedSizeSamplingIterator<>(maxUsersPerItem, prefs.iterator());
+        while (sampledPrefs.hasNext()) {
+          addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(sampledPrefs.next().getUserID()));
+        }
+      } else {
+        for (int i = 0; i < prefsLength; i++) {
+          addSomeOf(possibleItemsIDs, dataModel.getItemIDsFromUser(prefs.getUserID(i)));
+        }
+      }
+    }
+    if (!includeKnownItems) {
+      possibleItemsIDs.removeAll(preferredItemIDs);
+    }
+    return possibleItemsIDs;
+  }
+
+  // Stage 3: add a co-preferring user's items, sampled down to ~maxItemsPerUser in expectation.
+  private void addSomeOf(FastIDSet possibleItemIDs, FastIDSet itemIDs) {
+    if (itemIDs.size() > maxItemsPerUser) {
+      LongPrimitiveIterator it =
+          new SamplingLongPrimitiveIterator(itemIDs.iterator(), (double) maxItemsPerUser / itemIDs.size());
+      while (it.hasNext()) {
+        possibleItemIDs.add(it.nextLong());
+      }
+    } else {
+      possibleItemIDs.addAll(itemIDs);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
new file mode 100644
index 0000000..c6d417f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SimilarUser.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.common.RandomUtils;
+
+/** Simply encapsulates a user and a similarity value. */
+public final class SimilarUser implements Comparable<SimilarUser> {
+
+  private final long userID;
+  private final double similarity;
+
+  public SimilarUser(long userID, double similarity) {
+    this.userID = userID;
+    this.similarity = similarity;
+  }
+
+  long getUserID() {
+    return userID;
+  }
+
+  double getSimilarity() {
+    return similarity;
+  }
+
+  @Override
+  public int hashCode() {
+    return (int) userID ^ RandomUtils.hashDouble(similarity);
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof SimilarUser)) {
+      return false;
+    }
+    SimilarUser other = (SimilarUser) o;
+    // Double.compare (rather than ==) keeps equals() reflexive when similarity is NaN and
+    // consistent with the bits-based hashCode() for 0.0 vs. -0.0.
+    return userID == other.getUserID() && Double.compare(similarity, other.getSimilarity()) == 0;
+  }
+
+  @Override
+  public String toString() {
+    return "SimilarUser[user:" + userID + ", similarity:" + similarity + ']';
+  }
+
+  /**
+   * Defines an ordering from most similar to least similar; ties are broken by ascending user ID.
+   * Uses {@link Double#compare(double, double)} so that NaN similarities are totally ordered;
+   * the previous {@code >}/{@code <} comparisons left NaN unordered against every value, which
+   * could violate the Comparable contract and break sorts.
+   */
+  @Override
+  public int compareTo(SimilarUser other) {
+    int bySimilarity = Double.compare(other.getSimilarity(), similarity);
+    if (bySimilarity != 0) {
+      return bySimilarity;
+    }
+    return Long.compare(userID, other.getUserID());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
new file mode 100644
index 0000000..f7b4385
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/TopItems.java
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Queue;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.GenericUserSimilarity;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+/**
+ * <p>
+ * A simple class that refactors the "find top N things" logic that is used in several places.
+ * </p>
+ */
+public final class TopItems {
+
+  // Shared empty result, returned by getTopUsers() when nothing qualifies.
+  private static final long[] NO_IDS = new long[0];
+
+  // Utility class; never instantiated.
+  private TopItems() { }
+
+  /**
+   * Finds the {@code howMany} items with the highest estimated preference among
+   * {@code possibleItemIDs}, optionally filtered and rescored by {@code rescorer}.
+   *
+   * @param howMany maximum number of items to return
+   * @param possibleItemIDs candidate item IDs to consider; must not be null
+   * @param rescorer may be null; filters items and/or adjusts their scores
+   * @param estimator produces a preference estimate per item ID; must not be null
+   * @return top items sorted best-first; empty list if none qualify
+   * @throws TasteException if the estimator fails for a reason other than an unknown item
+   */
+  public static List<RecommendedItem> getTopItems(int howMany,
+                                                  LongPrimitiveIterator possibleItemIDs,
+                                                  IDRescorer rescorer,
+                                                  Estimator<Long> estimator) throws TasteException {
+    Preconditions.checkArgument(possibleItemIDs != null, "possibleItemIDs is null");
+    Preconditions.checkArgument(estimator != null, "estimator is null");
+
+    // Bounded priority queue ordered by the *reverse* of the final sort order, so the queue head
+    // is always the weakest retained item and can be evicted in O(log howMany).
+    Queue<RecommendedItem> topItems = new PriorityQueue<>(howMany + 1,
+      Collections.reverseOrder(ByValueRecommendedItemComparator.getInstance()));
+    boolean full = false;
+    double lowestTopValue = Double.NEGATIVE_INFINITY;
+    while (possibleItemIDs.hasNext()) {
+      long itemID = possibleItemIDs.next();
+      if (rescorer == null || !rescorer.isFiltered(itemID)) {
+        double preference;
+        try {
+          preference = estimator.estimate(itemID);
+        } catch (NoSuchItemException nsie) {
+          // Unknown items are skipped rather than aborting the whole computation.
+          continue;
+        }
+        double rescoredPref = rescorer == null ? preference : rescorer.rescore(itemID, preference);
+        // Only non-NaN scores that can displace the current weakest retained item are considered.
+        if (!Double.isNaN(rescoredPref) && (!full || rescoredPref > lowestTopValue)) {
+          topItems.add(new GenericRecommendedItem(itemID, (float) rescoredPref));
+          if (full) {
+            // Queue already at capacity: evict the weakest to stay at howMany entries.
+            topItems.poll();
+          } else if (topItems.size() > howMany) {
+            full = true;
+            topItems.poll();
+          }
+          // NOTE(review): if howMany == 0, the add/poll pair empties the queue and peek()
+          // returns null, which would NPE here; callers appear to pass howMany > 0 — confirm.
+          lowestTopValue = topItems.peek().getValue();
+        }
+      }
+    }
+    int size = topItems.size();
+    if (size == 0) {
+      return Collections.emptyList();
+    }
+    // Copy out of the heap and sort into final best-first order.
+    List<RecommendedItem> result = new ArrayList<>(size);
+    result.addAll(topItems);
+    Collections.sort(result, ByValueRecommendedItemComparator.getInstance());
+    return result;
+  }
+
+  /**
+   * Finds the {@code howMany} user IDs with the highest estimated similarity, using the same
+   * bounded-heap scheme as {@link #getTopItems}.
+   *
+   * @return user IDs ordered most- to least-similar; {@link #NO_IDS} if none qualify
+   */
+  public static long[] getTopUsers(int howMany,
+                                   LongPrimitiveIterator allUserIDs,
+                                   IDRescorer rescorer,
+                                   Estimator<Long> estimator) throws TasteException {
+    Queue<SimilarUser> topUsers = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
+    boolean full = false;
+    double lowestTopValue = Double.NEGATIVE_INFINITY;
+    while (allUserIDs.hasNext()) {
+      long userID = allUserIDs.next();
+      if (rescorer != null && rescorer.isFiltered(userID)) {
+        continue;
+      }
+      double similarity;
+      try {
+        similarity = estimator.estimate(userID);
+      } catch (NoSuchUserException nsue) {
+        // Unknown users are skipped.
+        continue;
+      }
+      double rescoredSimilarity = rescorer == null ? similarity : rescorer.rescore(userID, similarity);
+      if (!Double.isNaN(rescoredSimilarity) && (!full || rescoredSimilarity > lowestTopValue)) {
+        topUsers.add(new SimilarUser(userID, rescoredSimilarity));
+        if (full) {
+          topUsers.poll();
+        } else if (topUsers.size() > howMany) {
+          full = true;
+          topUsers.poll();
+        }
+        // NOTE(review): same howMany == 0 peek()-null caveat as in getTopItems — confirm.
+        lowestTopValue = topUsers.peek().getSimilarity();
+      }
+    }
+    int size = topUsers.size();
+    if (size == 0) {
+      return NO_IDS;
+    }
+    // Sort by SimilarUser's natural order (most similar first), then unwrap the IDs.
+    List<SimilarUser> sorted = new ArrayList<>(size);
+    sorted.addAll(topUsers);
+    Collections.sort(sorted);
+    long[] result = new long[size];
+    int i = 0;
+    for (SimilarUser similarUser : sorted) {
+      result[i++] = similarUser.getUserID();
+    }
+    return result;
+  }
+
+  /**
+   * <p>
+   * Thanks to tsmorton for suggesting this functionality and writing part of the code.
+   * </p>
+   *
+   * @see GenericItemSimilarity#GenericItemSimilarity(Iterable, int)
+   * @see GenericItemSimilarity#GenericItemSimilarity(org.apache.mahout.cf.taste.similarity.ItemSimilarity,
+   * org.apache.mahout.cf.taste.model.DataModel, int)
+   */
+  public static List<GenericItemSimilarity.ItemItemSimilarity> getTopItemItemSimilarities(
+    int howMany, Iterator<GenericItemSimilarity.ItemItemSimilarity> allSimilarities) {
+
+    // Same bounded-heap scheme as above, keyed on the similarity's own natural order.
+    Queue<GenericItemSimilarity.ItemItemSimilarity> topSimilarities
+      = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
+    boolean full = false;
+    double lowestTopValue = Double.NEGATIVE_INFINITY;
+    while (allSimilarities.hasNext()) {
+      GenericItemSimilarity.ItemItemSimilarity similarity = allSimilarities.next();
+      double value = similarity.getValue();
+      if (!Double.isNaN(value) && (!full || value > lowestTopValue)) {
+        topSimilarities.add(similarity);
+        if (full) {
+          topSimilarities.poll();
+        } else if (topSimilarities.size() > howMany) {
+          full = true;
+          topSimilarities.poll();
+        }
+        lowestTopValue = topSimilarities.peek().getValue();
+      }
+    }
+    int size = topSimilarities.size();
+    if (size == 0) {
+      return Collections.emptyList();
+    }
+    List<GenericItemSimilarity.ItemItemSimilarity> result = new ArrayList<>(size);
+    result.addAll(topSimilarities);
+    Collections.sort(result);
+    return result;
+  }
+
+  /**
+   * Mirror of {@link #getTopItemItemSimilarities} for user-user similarities.
+   */
+  public static List<GenericUserSimilarity.UserUserSimilarity> getTopUserUserSimilarities(
+    int howMany, Iterator<GenericUserSimilarity.UserUserSimilarity> allSimilarities) {
+
+    Queue<GenericUserSimilarity.UserUserSimilarity> topSimilarities
+      = new PriorityQueue<>(howMany + 1, Collections.reverseOrder());
+    boolean full = false;
+    double lowestTopValue = Double.NEGATIVE_INFINITY;
+    while (allSimilarities.hasNext()) {
+      GenericUserSimilarity.UserUserSimilarity similarity = allSimilarities.next();
+      double value = similarity.getValue();
+      if (!Double.isNaN(value) && (!full || value > lowestTopValue)) {
+        topSimilarities.add(similarity);
+        if (full) {
+          topSimilarities.poll();
+        } else if (topSimilarities.size() > howMany) {
+          full = true;
+          topSimilarities.poll();
+        }
+        lowestTopValue = topSimilarities.peek().getValue();
+      }
+    }
+    int size = topSimilarities.size();
+    if (size == 0) {
+      return Collections.emptyList();
+    }
+    List<GenericUserSimilarity.UserUserSimilarity> result = new ArrayList<>(size);
+    result.addAll(topSimilarities);
+    Collections.sort(result);
+    return result;
+  }
+
+  /** Pluggable scoring function used by the top-N searches above. */
+  public interface Estimator<T> {
+    double estimate(T thing) throws TasteException;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
new file mode 100644
index 0000000..0ba5139
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ALSWRFactorizer.java
@@ -0,0 +1,312 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
+import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * factorizes the rating matrix using "Alternating-Least-Squares with Weighted-λ-Regularization" as described in
+ * <a href="http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf">
+ * "Large-scale Collaborative Filtering for the Netflix Prize"</a>
+ *
+ * also supports the implicit feedback variant of this approach as described in "Collaborative Filtering for Implicit
+ * Feedback Datasets" available at http://research.yahoo.com/pub/2433
+ */
+public class ALSWRFactorizer extends AbstractFactorizer {
+
+  private final DataModel dataModel;
+
+  /** number of features used to compute this factorization */
+  private final int numFeatures;
+  /** parameter to control the regularization */
+  private final double lambda;
+  /** number of iterations */
+  private final int numIterations;
+
+  /** whether to solve the implicit-feedback variant of ALS-WR */
+  private final boolean usesImplicitFeedback;
+  /** confidence weighting parameter, only necessary when working with implicit feedback */
+  private final double alpha;
+
+  /** size of the worker pool used for each half-iteration */
+  private final int numTrainingThreads;
+
+  private static final double DEFAULT_ALPHA = 40;
+
+  private static final Logger log = LoggerFactory.getLogger(ALSWRFactorizer.class);
+
+  /**
+   * @param dataModel source of the preferences to factorize
+   * @param numFeatures number of latent features
+   * @param lambda regularization parameter
+   * @param numIterations number of alternating iterations
+   * @param usesImplicitFeedback whether to use the implicit-feedback solver
+   * @param alpha confidence weighting, used only with implicit feedback
+   * @param numTrainingThreads number of worker threads per half-iteration
+   */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      boolean usesImplicitFeedback, double alpha, int numTrainingThreads) throws TasteException {
+    super(dataModel);
+    this.dataModel = dataModel;
+    this.numFeatures = numFeatures;
+    this.lambda = lambda;
+    this.numIterations = numIterations;
+    this.usesImplicitFeedback = usesImplicitFeedback;
+    this.alpha = alpha;
+    this.numTrainingThreads = numTrainingThreads;
+  }
+
+  /** Like the full constructor, defaulting the thread count to the number of available processors. */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+      boolean usesImplicitFeedback, double alpha) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, usesImplicitFeedback, alpha,
+        Runtime.getRuntime().availableProcessors());
+  }
+
+  /** Explicit-feedback variant with the default alpha (unused in that mode). */
+  public ALSWRFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations) throws TasteException {
+    this(dataModel, numFeatures, lambda, numIterations, false, DEFAULT_ALPHA);
+  }
+
+  /** Holds the item (M) and user (U) feature matrices being alternately solved for. */
+  static class Features {
+
+    private final DataModel dataModel;
+    private final int numFeatures;
+
+    /** item features: one row of numFeatures values per item index */
+    private final double[][] M;
+    /** user features: one row of numFeatures values per user index */
+    private final double[][] U;
+
+    /**
+     * Initializes M with each item's average rating in feature 0 and small random values
+     * elsewhere (the standard ALS-WR initialization); U starts as all zeros and is solved first.
+     */
+    Features(ALSWRFactorizer factorizer) throws TasteException {
+      dataModel = factorizer.dataModel;
+      numFeatures = factorizer.numFeatures;
+      Random random = RandomUtils.getRandom();
+      M = new double[dataModel.getNumItems()][numFeatures];
+      LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs();
+      while (itemIDsIterator.hasNext()) {
+        long itemID = itemIDsIterator.nextLong();
+        int itemIDIndex = factorizer.itemIndex(itemID);
+        M[itemIDIndex][0] = averateRating(itemID);
+        for (int feature = 1; feature < numFeatures; feature++) {
+          M[itemIDIndex][feature] = random.nextDouble() * 0.1;
+        }
+      }
+      U = new double[dataModel.getNumUsers()][numFeatures];
+    }
+
+    double[][] getM() {
+      return M;
+    }
+
+    double[][] getU() {
+      return U;
+    }
+
+    Vector getUserFeatureColumn(int index) {
+      return new DenseVector(U[index]);
+    }
+
+    Vector getItemFeatureColumn(int index) {
+      return new DenseVector(M[index]);
+    }
+
+    void setFeatureColumnInU(int idIndex, Vector vector) {
+      setFeatureColumn(U, idIndex, vector);
+    }
+
+    void setFeatureColumnInM(int idIndex, Vector vector) {
+      setFeatureColumn(M, idIndex, vector);
+    }
+
+    protected void setFeatureColumn(double[][] matrix, int idIndex, Vector vector) {
+      for (int feature = 0; feature < numFeatures; feature++) {
+        matrix[idIndex][feature] = vector.get(feature);
+      }
+    }
+
+    // Mean rating for the given item. Name keeps its historical typo ("averate") because the
+    // method is protected and may be referenced by subclasses.
+    protected double averateRating(long itemID) throws TasteException {
+      PreferenceArray prefs = dataModel.getPreferencesForItem(itemID);
+      RunningAverage avg = new FullRunningAverage();
+      for (Preference pref : prefs) {
+        avg.addDatum(pref.getValue());
+      }
+      return avg.getAverage();
+    }
+  }
+
+  /**
+   * Runs {@code numIterations} rounds of alternating least squares: each round first fixes M
+   * and solves every user's feature row, then fixes U and solves every item's feature row.
+   * Each half-round is farmed out to a fresh fixed-size thread pool.
+   */
+  @Override
+  public Factorization factorize() throws TasteException {
+    log.info("starting to compute the factorization...");
+    final Features features = new Features(this);
+
+    /* feature maps necessary for solving for implicit feedback */
+    OpenIntObjectHashMap<Vector> userY = null;
+    OpenIntObjectHashMap<Vector> itemY = null;
+
+    if (usesImplicitFeedback) {
+      userY = userFeaturesMapping(dataModel.getUserIDs(), dataModel.getNumUsers(), features.getU());
+      itemY = itemFeaturesMapping(dataModel.getItemIDs(), dataModel.getNumItems(), features.getM());
+    }
+
+    for (int iteration = 0; iteration < numIterations; iteration++) {
+      log.info("iteration {}", iteration);
+
+      /* fix M - compute U */
+      ExecutorService queue = createQueue();
+      LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
+      try {
+
+        final ImplicitFeedbackAlternatingLeastSquaresSolver implicitFeedbackSolver = usesImplicitFeedback
+            ? new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, itemY, numTrainingThreads)
+            : null;
+
+        while (userIDsIterator.hasNext()) {
+          final long userID = userIDsIterator.nextLong();
+          final LongPrimitiveIterator itemIDsFromUser = dataModel.getItemIDsFromUser(userID).iterator();
+          final PreferenceArray userPrefs = dataModel.getPreferencesFromUser(userID);
+          queue.execute(new Runnable() {
+            @Override
+            public void run() {
+              List<Vector> featureVectors = new ArrayList<>();
+              while (itemIDsFromUser.hasNext()) {
+                long itemID = itemIDsFromUser.nextLong();
+                featureVectors.add(features.getItemFeatureColumn(itemIndex(itemID)));
+              }
+
+              Vector userFeatures = usesImplicitFeedback
+                  ? implicitFeedbackSolver.solve(sparseUserRatingVector(userPrefs))
+                  : AlternatingLeastSquaresSolver.solve(featureVectors, ratingVector(userPrefs), lambda, numFeatures);
+
+              features.setFeatureColumnInU(userIndex(userID), userFeatures);
+            }
+          });
+        }
+      } finally {
+        queue.shutdown();
+        try {
+          // Generous bound: one second per user.
+          queue.awaitTermination(dataModel.getNumUsers(), TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          log.warn("Error when computing user features", e);
+          // Restore the interrupt status so callers can observe the interruption.
+          Thread.currentThread().interrupt();
+        }
+      }
+
+      /* fix U - compute M */
+      queue = createQueue();
+      LongPrimitiveIterator itemIDsIterator = dataModel.getItemIDs();
+      try {
+
+        final ImplicitFeedbackAlternatingLeastSquaresSolver implicitFeedbackSolver = usesImplicitFeedback
+            ? new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, userY, numTrainingThreads)
+            : null;
+
+        while (itemIDsIterator.hasNext()) {
+          final long itemID = itemIDsIterator.nextLong();
+          final PreferenceArray itemPrefs = dataModel.getPreferencesForItem(itemID);
+          queue.execute(new Runnable() {
+            @Override
+            public void run() {
+              List<Vector> featureVectors = new ArrayList<>();
+              for (Preference pref : itemPrefs) {
+                long userID = pref.getUserID();
+                featureVectors.add(features.getUserFeatureColumn(userIndex(userID)));
+              }
+
+              Vector itemFeatures = usesImplicitFeedback
+                  ? implicitFeedbackSolver.solve(sparseItemRatingVector(itemPrefs))
+                  : AlternatingLeastSquaresSolver.solve(featureVectors, ratingVector(itemPrefs), lambda, numFeatures);
+
+              features.setFeatureColumnInM(itemIndex(itemID), itemFeatures);
+            }
+          });
+        }
+      } finally {
+        queue.shutdown();
+        try {
+          queue.awaitTermination(dataModel.getNumItems(), TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+          log.warn("Error when computing item features", e);
+          // Restore the interrupt status so callers can observe the interruption.
+          Thread.currentThread().interrupt();
+        }
+      }
+    }
+
+    log.info("finished computation of the factorization...");
+    return createFactorization(features.getU(), features.getM());
+  }
+
+  protected ExecutorService createQueue() {
+    return Executors.newFixedThreadPool(numTrainingThreads);
+  }
+
+  /** Dense vector of the preference values in {@code prefs}, in order. */
+  protected static Vector ratingVector(PreferenceArray prefs) {
+    double[] ratings = new double[prefs.length()];
+    for (int n = 0; n < prefs.length(); n++) {
+      ratings[n] = prefs.get(n).getValue();
+    }
+    return new DenseVector(ratings, true);
+  }
+
+  //TODO find a way to get rid of the object overhead here
+  protected OpenIntObjectHashMap<Vector> itemFeaturesMapping(LongPrimitiveIterator itemIDs, int numItems,
+      double[][] featureMatrix) {
+    OpenIntObjectHashMap<Vector> mapping = new OpenIntObjectHashMap<>(numItems);
+    while (itemIDs.hasNext()) {
+      long itemID = itemIDs.next();
+      int itemIndex = itemIndex(itemID);
+      // reuse the already-computed index rather than resolving itemIndex(itemID) twice
+      mapping.put(itemIndex, new DenseVector(featureMatrix[itemIndex], true));
+    }
+
+    return mapping;
+  }
+
+  protected OpenIntObjectHashMap<Vector> userFeaturesMapping(LongPrimitiveIterator userIDs, int numUsers,
+      double[][] featureMatrix) {
+    OpenIntObjectHashMap<Vector> mapping = new OpenIntObjectHashMap<>(numUsers);
+
+    while (userIDs.hasNext()) {
+      long userID = userIDs.next();
+      int userIndex = userIndex(userID);
+      // reuse the already-computed index rather than resolving userIndex(userID) twice
+      mapping.put(userIndex, new DenseVector(featureMatrix[userIndex], true));
+    }
+
+    return mapping;
+  }
+
+  /** Sparse vector of an item's ratings, indexed by user index. */
+  protected Vector sparseItemRatingVector(PreferenceArray prefs) {
+    SequentialAccessSparseVector ratings = new SequentialAccessSparseVector(Integer.MAX_VALUE, prefs.length());
+    for (Preference preference : prefs) {
+      ratings.set(userIndex(preference.getUserID()), preference.getValue());
+    }
+    return ratings;
+  }
+
+  /** Sparse vector of a user's ratings, indexed by item index. */
+  protected Vector sparseUserRatingVector(PreferenceArray prefs) {
+    SequentialAccessSparseVector ratings = new SequentialAccessSparseVector(Integer.MAX_VALUE, prefs.length());
+    for (Preference preference : prefs) {
+      ratings.set(itemIndex(preference.getItemID()), preference.getValue());
+    }
+    return ratings;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
new file mode 100644
index 0000000..0a39a1d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/AbstractFactorizer.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * base class for {@link Factorizer}s, provides ID to index mapping
+ */
+public abstract class AbstractFactorizer implements Factorizer {
+
+  /** model whose user/item IDs are mapped to matrix row indices */
+  private final DataModel dataModel;
+  /** maps a user ID to the row index of that user in the user features matrix */
+  private FastByIDMap<Integer> userIDMapping;
+  /** maps an item ID to the row index of that item in the item features matrix */
+  private FastByIDMap<Integer> itemIDMapping;
+  /** rebuilds both ID mappings whenever the underlying data model is refreshed */
+  private final RefreshHelper refreshHelper;
+
+  /**
+   * @param dataModel model providing the user and item IDs to index
+   * @throws TasteException if the data model cannot be read while building the mappings
+   */
+  protected AbstractFactorizer(DataModel dataModel) throws TasteException {
+    this.dataModel = dataModel;
+    buildMappings();
+    refreshHelper = new RefreshHelper(new Callable<Object>() {
+      @Override
+      public Object call() throws TasteException {
+        buildMappings();
+        return null;
+      }
+    });
+    refreshHelper.addDependency(dataModel);
+  }
+
+  // (re)creates both ID-to-index mappings from the current state of the data model
+  private void buildMappings() throws TasteException {
+    userIDMapping = createIDMapping(dataModel.getNumUsers(), dataModel.getUserIDs());
+    itemIDMapping = createIDMapping(dataModel.getNumItems(), dataModel.getItemIDs());
+  }
+
+  /** Wraps the given feature matrices together with the current ID mappings. */
+  protected Factorization createFactorization(double[][] userFeatures, double[][] itemFeatures) {
+    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+  }
+
+  /**
+   * Returns the matrix row index for the given user ID. An unknown ID is lazily
+   * assigned the next free index.
+   * NOTE(review): this mutates the shared mapping without synchronization —
+   * confirm callers never invoke it concurrently with unseen IDs.
+   */
+  protected Integer userIndex(long userID) {
+    Integer userIndex = userIDMapping.get(userID);
+    if (userIndex == null) {
+      userIndex = userIDMapping.size();
+      userIDMapping.put(userID, userIndex);
+    }
+    return userIndex;
+  }
+
+  /**
+   * Returns the matrix row index for the given item ID. An unknown ID is lazily
+   * assigned the next free index.
+   * NOTE(review): this mutates the shared mapping without synchronization —
+   * confirm callers never invoke it concurrently with unseen IDs.
+   */
+  protected Integer itemIndex(long itemID) {
+    Integer itemIndex = itemIDMapping.get(itemID);
+    if (itemIndex == null) {
+      itemIndex = itemIDMapping.size();
+      itemIDMapping.put(itemID, itemIndex);
+    }
+    return itemIndex;
+  }
+
+  /** Maps each ID produced by the iterator to consecutive indices starting at 0. */
+  private static FastByIDMap<Integer> createIDMapping(int size, LongPrimitiveIterator idIterator) {
+    FastByIDMap<Integer> mapping = new FastByIDMap<>(size);
+    int index = 0;
+    while (idIterator.hasNext()) {
+      mapping.put(idIterator.nextLong(), index++);
+    }
+    return mapping;
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    refreshHelper.refresh(alreadyRefreshed);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
new file mode 100644
index 0000000..f169a60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorization.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.Arrays;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+/**
+ * a factorization of the rating matrix
+ */
+public class Factorization {
+
+  /** used to find the rows in the user features matrix by userID */
+  private final FastByIDMap<Integer> userIDMapping;
+  /** used to find the rows in the item features matrix by itemID */
+  private final FastByIDMap<Integer> itemIDMapping;
+
+  /** user features matrix */
+  private final double[][] userFeatures;
+  /** item features matrix */
+  private final double[][] itemFeatures;
+
+  /**
+   * @param userIDMapping userID to row index in {@code userFeatures}, must not be null
+   * @param itemIDMapping itemID to row index in {@code itemFeatures}, must not be null
+   * @param userFeatures user features matrix, one row per mapped user
+   * @param itemFeatures item features matrix, one row per mapped item
+   */
+  public Factorization(FastByIDMap<Integer> userIDMapping, FastByIDMap<Integer> itemIDMapping, double[][] userFeatures,
+      double[][] itemFeatures) {
+    this.userIDMapping = Preconditions.checkNotNull(userIDMapping);
+    this.itemIDMapping = Preconditions.checkNotNull(itemIDMapping);
+    this.userFeatures = userFeatures;
+    this.itemFeatures = itemFeatures;
+  }
+
+  /** Returns the backing user features matrix (not a copy). */
+  public double[][] allUserFeatures() {
+    return userFeatures;
+  }
+
+  /**
+   * @return the feature row for the given user (not a copy)
+   * @throws NoSuchUserException if the user is not part of this factorization
+   */
+  public double[] getUserFeatures(long userID) throws NoSuchUserException {
+    Integer index = userIDMapping.get(userID);
+    if (index == null) {
+      throw new NoSuchUserException(userID);
+    }
+    return userFeatures[index];
+  }
+
+  /** Returns the backing item features matrix (not a copy). */
+  public double[][] allItemFeatures() {
+    return itemFeatures;
+  }
+
+  /**
+   * @return the feature row for the given item (not a copy)
+   * @throws NoSuchItemException if the item is not part of this factorization
+   */
+  public double[] getItemFeatures(long itemID) throws NoSuchItemException {
+    Integer index = itemIDMapping.get(itemID);
+    if (index == null) {
+      throw new NoSuchItemException(itemID);
+    }
+    return itemFeatures[index];
+  }
+
+  /**
+   * @return the row index of the given user in the user features matrix
+   * @throws NoSuchUserException if the user is not part of this factorization
+   */
+  public int userIndex(long userID) throws NoSuchUserException {
+    Integer index = userIDMapping.get(userID);
+    if (index == null) {
+      throw new NoSuchUserException(userID);
+    }
+    return index;
+  }
+
+  /** Returns all (userID, row index) pairs of this factorization. */
+  public Iterable<Map.Entry<Long,Integer>> getUserIDMappings() {
+    return userIDMapping.entrySet();
+  }
+
+  /** Returns an iterator over all mapped user IDs. */
+  public LongPrimitiveIterator getUserIDMappingKeys() {
+    return userIDMapping.keySetIterator();
+  }
+
+  /**
+   * @return the row index of the given item in the item features matrix
+   * @throws NoSuchItemException if the item is not part of this factorization
+   */
+  public int itemIndex(long itemID) throws NoSuchItemException {
+    Integer index = itemIDMapping.get(itemID);
+    if (index == null) {
+      throw new NoSuchItemException(itemID);
+    }
+    return index;
+  }
+
+  /** Returns all (itemID, row index) pairs of this factorization. */
+  public Iterable<Map.Entry<Long,Integer>> getItemIDMappings() {
+    return itemIDMapping.entrySet();
+  }
+
+  /** Returns an iterator over all mapped item IDs. */
+  public LongPrimitiveIterator getItemIDMappingKeys() {
+    return itemIDMapping.keySetIterator();
+  }
+
+  // NOTE(review): derived from the first user row; returns 0 when there are no users
+  // even if item rows exist — confirm callers tolerate this edge case
+  public int numFeatures() {
+    return userFeatures.length > 0 ? userFeatures[0].length : 0;
+  }
+
+  public int numUsers() {
+    return userIDMapping.size();
+  }
+
+  public int numItems() {
+    return itemIDMapping.size();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (o instanceof Factorization) {
+      Factorization other = (Factorization) o;
+      return userIDMapping.equals(other.userIDMapping) && itemIDMapping.equals(other.itemIDMapping)
+          && Arrays.deepEquals(userFeatures, other.userFeatures) && Arrays.deepEquals(itemFeatures, other.itemFeatures);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    int hashCode = 31 * userIDMapping.hashCode() + itemIDMapping.hashCode();
+    hashCode = 31 * hashCode + Arrays.deepHashCode(userFeatures);
+    hashCode = 31 * hashCode + Arrays.deepHashCode(itemFeatures);
+    return hashCode;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
new file mode 100644
index 0000000..2cabe73
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/Factorizer.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * Implementation must be able to create a factorization of a rating matrix
+ */
+public interface Factorizer extends Refreshable {
+
+  /**
+   * Computes a factorization of the rating matrix.
+   *
+   * @return the computed {@link Factorization}
+   * @throws TasteException if an error occurs while accessing the underlying data
+   */
+  Factorization factorize() throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
new file mode 100644
index 0000000..08c038a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/FilePersistenceStrategy.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Provides a file-based persistent store. */
+public class FilePersistenceStrategy implements PersistenceStrategy {
+
+  /** destination file for the serialized factorization */
+  private final File file;
+
+  private static final Logger log = LoggerFactory.getLogger(FilePersistenceStrategy.class);
+
+  /**
+   * @param file the file to use for storage. If the file does not exist it will be created when required.
+   */
+  public FilePersistenceStrategy(File file) {
+    this.file = Preconditions.checkNotNull(file);
+  }
+
+  /**
+   * Reads a previously persisted factorization from the backing file.
+   *
+   * @return the stored {@link Factorization}, or {@code null} if the file does not exist yet
+   */
+  @Override
+  public Factorization load() throws IOException {
+    if (!file.exists()) {
+      log.info("{} does not yet exist, no factorization found", file.getAbsolutePath());
+      return null;
+    }
+    try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(file)))){
+      log.info("Reading factorization from {}...", file.getAbsolutePath());
+      return readBinary(in);
+    }
+  }
+
+  // NOTE(review): despite the "maybe" in the PersistenceStrategy contract, this
+  // always rewrites the file — it never compares against an existing factorization
+  @Override
+  public void maybePersist(Factorization factorization) throws IOException {
+    try (DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)))){
+      log.info("Writing factorization to {}...", file.getAbsolutePath());
+      writeBinary(factorization, out);
+    }
+  }
+
+  /**
+   * Serializes a factorization. Layout: numFeatures, numUsers, numItems (ints),
+   * then per user: index (int), userID (long), numFeatures doubles;
+   * then per item: index (int), itemID (long), numFeatures doubles.
+   */
+  protected static void writeBinary(Factorization factorization, DataOutput out) throws IOException {
+    out.writeInt(factorization.numFeatures());
+    out.writeInt(factorization.numUsers());
+    out.writeInt(factorization.numItems());
+
+    for (Map.Entry<Long,Integer> mappingEntry : factorization.getUserIDMappings()) {
+      long userID = mappingEntry.getKey();
+      out.writeInt(mappingEntry.getValue());
+      out.writeLong(userID);
+      try {
+        double[] userFeatures = factorization.getUserFeatures(userID);
+        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
+          out.writeDouble(userFeatures[feature]);
+        }
+      } catch (NoSuchUserException e) {
+        throw new IOException("Unable to persist factorization", e);
+      }
+    }
+
+    for (Map.Entry<Long,Integer> entry : factorization.getItemIDMappings()) {
+      long itemID = entry.getKey();
+      out.writeInt(entry.getValue());
+      out.writeLong(itemID);
+      try {
+        double[] itemFeatures = factorization.getItemFeatures(itemID);
+        for (int feature = 0; feature < factorization.numFeatures(); feature++) {
+          out.writeDouble(itemFeatures[feature]);
+        }
+      } catch (NoSuchItemException e) {
+        throw new IOException("Unable to persist factorization", e);
+      }
+    }
+  }
+
+  /**
+   * Deserializes a factorization written by {@link #writeBinary}.
+   * NOTE(review): row indices read from the file are trusted without bounds checks;
+   * a corrupt or truncated file can cause ArrayIndexOutOfBoundsException rather than IOException.
+   */
+  public static Factorization readBinary(DataInput in) throws IOException {
+    int numFeatures = in.readInt();
+    int numUsers = in.readInt();
+    int numItems = in.readInt();
+
+    FastByIDMap<Integer> userIDMapping = new FastByIDMap<>(numUsers);
+    double[][] userFeatures = new double[numUsers][numFeatures];
+
+    for (int n = 0; n < numUsers; n++) {
+      int userIndex = in.readInt();
+      long userID = in.readLong();
+      userIDMapping.put(userID, userIndex);
+      for (int feature = 0; feature < numFeatures; feature++) {
+        userFeatures[userIndex][feature] = in.readDouble();
+      }
+    }
+
+    FastByIDMap<Integer> itemIDMapping = new FastByIDMap<>(numItems);
+    double[][] itemFeatures = new double[numItems][numFeatures];
+
+    for (int n = 0; n < numItems; n++) {
+      int itemIndex = in.readInt();
+      long itemID = in.readLong();
+      itemIDMapping.put(itemID, itemIndex);
+      for (int feature = 0; feature < numFeatures; feature++) {
+        itemFeatures[itemIndex][feature] = in.readDouble();
+      }
+    }
+
+    return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
new file mode 100644
index 0000000..0d1aab0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/NoPersistenceStrategy.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+
+/**
+ * A {@link PersistenceStrategy} which does nothing.
+ */
+public class NoPersistenceStrategy implements PersistenceStrategy {
+
+  /** Always reports an empty store, forcing the caller to recompute the factorization. */
+  @Override
+  public Factorization load() throws IOException {
+    return null;
+  }
+
+  /** Discards the factorization without persisting it anywhere. */
+  @Override
+  public void maybePersist(Factorization factorization) throws IOException {
+    // do nothing.
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
new file mode 100644
index 0000000..8a6a702
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/ParallelSGDFactorizer.java
@@ -0,0 +1,340 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Minimalistic implementation of Parallel SGD factorizer based on
+ * <a href="http://www.sze.hu/~gtakacs/download/jmlr_2009.pdf">
+ * "Scalable Collaborative Filtering Approaches for Large Recommender Systems"</a>
+ * and
+ * <a href="http://www.cs.wisc.edu/~brecht/papers/hogwildTR.pdf">
+ * "Hogwild!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent"</a> */
+public class ParallelSGDFactorizer extends AbstractFactorizer {
+
+ private final DataModel dataModel;
+ /** Parameter used to prevent overfitting. */
+ private final double lambda;
+ /** Number of features used to compute this factorization */
+ private final int rank;
+ /** Number of iterations */
+ private final int numEpochs;
+
+ private int numThreads;
+
+ // these next two control decayFactor^steps exponential type of annealing learning rate and decay factor
+ private double mu0 = 0.01;
+ private double decayFactor = 1;
+ // these next two control 1/steps^forget type annealing
+ private int stepOffset = 0;
+ // -1 equals even weighting of all examples, 0 means only use exponential annealing
+ private double forgettingExponent = 0;
+
+ // The following two should be inversely proportional :)
+ private double biasMuRatio = 0.5;
+ private double biasLambdaRatio = 0.1;
+
+ /** TODO: this is not safe as += is not atomic on many processors, can be replaced with AtomicDoubleArray
+ * but it works just fine right now */
+ /** user features */
+ protected volatile double[][] userVectors;
+ /** item features */
+ protected volatile double[][] itemVectors;
+
+ private final PreferenceShuffler shuffler;
+
+ private int epoch = 1;
+ /** place in user vector where the bias is stored */
+ private static final int USER_BIAS_INDEX = 1;
+ /** place in item vector where the bias is stored */
+ private static final int ITEM_BIAS_INDEX = 2;
+ private static final int FEATURE_OFFSET = 3;
+ /** Standard deviation for random initialization of features */
+ private static final double NOISE = 0.02;
+
+ private static final Logger logger = LoggerFactory.getLogger(ParallelSGDFactorizer.class);
+
+ protected static class PreferenceShuffler {
+
+ private Preference[] preferences;
+ private Preference[] unstagedPreferences;
+
+ protected final RandomWrapper random = RandomUtils.getRandom();
+
+ public PreferenceShuffler(DataModel dataModel) throws TasteException {
+ cachePreferences(dataModel);
+ shuffle();
+ stage();
+ }
+
+ private int countPreferences(DataModel dataModel) throws TasteException {
+ int numPreferences = 0;
+ LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+ while (userIDs.hasNext()) {
+ PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userIDs.nextLong());
+ numPreferences += preferencesFromUser.length();
+ }
+ return numPreferences;
+ }
+
+ private void cachePreferences(DataModel dataModel) throws TasteException {
+ int numPreferences = countPreferences(dataModel);
+ preferences = new Preference[numPreferences];
+
+ LongPrimitiveIterator userIDs = dataModel.getUserIDs();
+ int index = 0;
+ while (userIDs.hasNext()) {
+ long userID = userIDs.nextLong();
+ PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userID);
+ for (Preference preference : preferencesFromUser) {
+ preferences[index++] = preference;
+ }
+ }
+ }
+
+ public final void shuffle() {
+ unstagedPreferences = preferences.clone();
+ /* Durstenfeld shuffle */
+ for (int i = unstagedPreferences.length - 1; i > 0; i--) {
+ int rand = random.nextInt(i + 1);
+ swapCachedPreferences(i, rand);
+ }
+ }
+
+ //merge this part into shuffle() will make compiler-optimizer do some real absurd stuff, test on OpenJDK7
+ private void swapCachedPreferences(int x, int y) {
+ Preference p = unstagedPreferences[x];
+
+ unstagedPreferences[x] = unstagedPreferences[y];
+ unstagedPreferences[y] = p;
+ }
+
+ public final void stage() {
+ preferences = unstagedPreferences;
+ }
+
+ public Preference get(int i) {
+ return preferences[i];
+ }
+
+ public int size() {
+ return preferences.length;
+ }
+
+ }
+
+ public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numEpochs)
+ throws TasteException {
+ super(dataModel);
+ this.dataModel = dataModel;
+ this.rank = numFeatures + FEATURE_OFFSET;
+ this.lambda = lambda;
+ this.numEpochs = numEpochs;
+
+ shuffler = new PreferenceShuffler(dataModel);
+
+ //max thread num set to n^0.25 as suggested by hogwild! paper
+ numThreads = Math.min(Runtime.getRuntime().availableProcessors(), (int) Math.pow((double) shuffler.size(), 0.25));
+ }
+
+ public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+ double mu0, double decayFactor, int stepOffset, double forgettingExponent) throws TasteException {
+ this(dataModel, numFeatures, lambda, numIterations);
+
+ this.mu0 = mu0;
+ this.decayFactor = decayFactor;
+ this.stepOffset = stepOffset;
+ this.forgettingExponent = forgettingExponent;
+ }
+
+ public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+ double mu0, double decayFactor, int stepOffset, double forgettingExponent, int numThreads) throws TasteException {
+ this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);
+
+ this.numThreads = numThreads;
+ }
+
+ public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+ double mu0, double decayFactor, int stepOffset, double forgettingExponent,
+ double biasMuRatio, double biasLambdaRatio) throws TasteException {
+ this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);
+
+ this.biasMuRatio = biasMuRatio;
+ this.biasLambdaRatio = biasLambdaRatio;
+ }
+
+ public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
+ double mu0, double decayFactor, int stepOffset, double forgettingExponent,
+ double biasMuRatio, double biasLambdaRatio, int numThreads) throws TasteException {
+ this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent, biasMuRatio,
+ biasLambdaRatio);
+
+ this.numThreads = numThreads;
+ }
+
+ protected void initialize() throws TasteException {
+ RandomWrapper random = RandomUtils.getRandom();
+ userVectors = new double[dataModel.getNumUsers()][rank];
+ itemVectors = new double[dataModel.getNumItems()][rank];
+
+ double globalAverage = getAveragePreference();
+ for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
+ userVectors[userIndex][0] = globalAverage;
+ userVectors[userIndex][USER_BIAS_INDEX] = 0; // will store user bias
+ userVectors[userIndex][ITEM_BIAS_INDEX] = 1; // corresponding item feature contains item bias
+ for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
+ userVectors[userIndex][feature] = random.nextGaussian() * NOISE;
+ }
+ }
+ for (int itemIndex = 0; itemIndex < itemVectors.length; itemIndex++) {
+ itemVectors[itemIndex][0] = 1; // corresponding user feature contains global average
+ itemVectors[itemIndex][USER_BIAS_INDEX] = 1; // corresponding user feature contains user bias
+ itemVectors[itemIndex][ITEM_BIAS_INDEX] = 0; // will store item bias
+ for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
+ itemVectors[itemIndex][feature] = random.nextGaussian() * NOISE;
+ }
+ }
+ }
+
+ //TODO: needs optimization
+ private double getMu(int i) {
+ return mu0 * Math.pow(decayFactor, i - 1) * Math.pow(i + stepOffset, forgettingExponent);
+ }
+
+ @Override
+ public Factorization factorize() throws TasteException {
+ initialize();
+
+ if (logger.isInfoEnabled()) {
+ logger.info("starting to compute the factorization...");
+ }
+
+ for (epoch = 1; epoch <= numEpochs; epoch++) {
+ shuffler.stage();
+
+ final double mu = getMu(epoch);
+ int subSize = shuffler.size() / numThreads + 1;
+
+ ExecutorService executor=Executors.newFixedThreadPool(numThreads);
+
+ try {
+ for (int t = 0; t < numThreads; t++) {
+ final int iStart = t * subSize;
+ final int iEnd = Math.min((t + 1) * subSize, shuffler.size());
+
+ executor.execute(new Runnable() {
+ @Override
+ public void run() {
+ for (int i = iStart; i < iEnd; i++) {
+ update(shuffler.get(i), mu);
+ }
+ }
+ });
+ }
+ } finally {
+ executor.shutdown();
+ shuffler.shuffle();
+
+ try {
+ boolean terminated = executor.awaitTermination(numEpochs * shuffler.size(), TimeUnit.MICROSECONDS);
+ if (!terminated) {
+ logger.error("subtasks takes forever, return anyway");
+ }
+ } catch (InterruptedException e) {
+ throw new TasteException("waiting fof termination interrupted", e);
+ }
+ }
+
+ }
+
+ return createFactorization(userVectors, itemVectors);
+ }
+
+ double getAveragePreference() throws TasteException {
+ RunningAverage average = new FullRunningAverage();
+ LongPrimitiveIterator it = dataModel.getUserIDs();
+ while (it.hasNext()) {
+ for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) {
+ average.addDatum(pref.getValue());
+ }
+ }
+ return average.getAverage();
+ }
+
+ /** TODO: this is the vanilla sgd by Tacaks 2009, I speculate that using scaling technique proposed in:
+ * Towards Optimal One Pass Large Scale Learning with Averaged Stochastic Gradient Descent section 5, page 6
+ * can be beneficial in term s of both speed and accuracy.
+ *
+ * Tacaks' method doesn't calculate gradient of regularization correctly, which has non-zero elements everywhere of
+ * the matrix. While Tacaks' method can only updates a single row/column, if one user has a lot of recommendation,
+ * her vector will be more affected by regularization using an isolated scaling factor for both user vectors and
+ * item vectors can remove this issue without inducing more update cost it even reduces it a bit by only performing
+ * one addition and one multiplication.
+ *
+ * BAD SIDE1: the scaling factor decreases fast, it has to be scaled up from time to time before dropped to zero or
+ * caused roundoff error
+ * BAD SIDE2: no body experiment on it before, and people generally use very small lambda
+ * so it's impact on accuracy may still be unknown.
+ * BAD SIDE3: don't know how to make it work for L1-regularization or
+ * "pseudorank?" (sum of singular values)-regularization */
+ protected void update(Preference preference, double mu) {
+ int userIndex = userIndex(preference.getUserID());
+ int itemIndex = itemIndex(preference.getItemID());
+
+ double[] userVector = userVectors[userIndex];
+ double[] itemVector = itemVectors[itemIndex];
+
+ double prediction = dot(userVector, itemVector);
+ double err = preference.getValue() - prediction;
+
+ // adjust features
+ for (int k = FEATURE_OFFSET; k < rank; k++) {
+ double userFeature = userVector[k];
+ double itemFeature = itemVector[k];
+
+ userVector[k] += mu * (err * itemFeature - lambda * userFeature);
+ itemVector[k] += mu * (err * userFeature - lambda * itemFeature);
+ }
+
+ // adjust user and item bias
+ userVector[USER_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * userVector[USER_BIAS_INDEX]);
+ itemVector[ITEM_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * itemVector[ITEM_BIAS_INDEX]);
+ }
+
+ private double dot(double[] userVector, double[] itemVector) {
+ double sum = 0;
+ for (int k = 0; k < rank; k++) {
+ sum += userVector[k] * itemVector[k];
+ }
+ return sum;
+ }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
new file mode 100644
index 0000000..abf3eca
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/PersistenceStrategy.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import java.io.IOException;
+
/**
 * Provides storage for {@link Factorization}s.
 */
public interface PersistenceStrategy {

  /**
   * Load a factorization from a persistent store.
   *
   * @return a Factorization or null if the persistent store is empty.
   *
   * @throws IOException if the factorization cannot be read from the store
   */
  Factorization load() throws IOException;

  /**
   * Write a factorization to a persistent store unless it already
   * contains an identical factorization.
   *
   * @param factorization the factorization to persist
   *
   * @throws IOException if the factorization cannot be written to the store
   */
  void maybePersist(Factorization factorization) throws IOException;

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
new file mode 100644
index 0000000..2c9f0ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/svd/RatingSGDFactorizer.java
@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
/**
 * Matrix factorization with user and item biases for rating prediction, trained with plain vanilla SGD.
 *
 * <p>Each user and item vector reserves its first {@link #FEATURE_OFFSET} slots for bias terms:
 * slot 0 carries the global average (user side) paired with a constant 1 (item side),
 * slot {@link #USER_BIAS_INDEX} carries the user bias, and slot {@link #ITEM_BIAS_INDEX} the item
 * bias. The remaining slots hold the learned latent features, so the dot product of a user and an
 * item vector yields the full biased prediction.</p>
 */
public class RatingSGDFactorizer extends AbstractFactorizer {

  /** Number of leading vector slots reserved for the global average and the two bias terms. */
  protected static final int FEATURE_OFFSET = 3;

  /** Multiplicative decay factor for learning_rate */
  protected final double learningRateDecay;
  /** Learning rate (step size) */
  protected final double learningRate;
  /** Parameter used to prevent overfitting. */
  protected final double preventOverfitting;
  /** Number of features used to compute this factorization */
  protected final int numFeatures;
  /** Number of iterations */
  private final int numIterations;
  /** Standard deviation for random initialization of features */
  protected final double randomNoise;
  /** User features */
  protected double[][] userVectors;
  /** Item features */
  protected double[][] itemVectors;
  protected final DataModel dataModel;
  // flattened (userID, itemID) pairs of every preference, cached so training can
  // iterate over them in shuffled order without re-querying the DataModel
  private long[] cachedUserIDs;
  private long[] cachedItemIDs;

  // multiplier applied to the learning rate for the bias terms
  protected double biasLearningRate = 0.5;
  // multiplier applied to the regularization strength for the bias terms
  protected double biasReg = 0.1;

  /** place in user vector where the bias is stored */
  protected static final int USER_BIAS_INDEX = 1;
  /** place in item vector where the bias is stored */
  protected static final int ITEM_BIAS_INDEX = 2;

  /**
   * Creates a factorizer with default hyperparameters
   * (learningRate=0.01, preventOverfitting=0.1, randomNoise=0.01, no learning-rate decay).
   */
  public RatingSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) throws TasteException {
    this(dataModel, numFeatures, 0.01, 0.1, 0.01, numIterations, 1.0);
  }

  /**
   * @param dataModel source of the preferences to factorize
   * @param numFeatures number of latent features (bias slots are added on top of this)
   * @param learningRate SGD step size
   * @param preventOverfitting regularization strength
   * @param randomNoise standard deviation for random initialization of the features
   * @param numIterations number of full passes over the training data
   * @param learningRateDecay multiplicative decay applied to the learning rate after each pass
   */
  public RatingSGDFactorizer(DataModel dataModel, int numFeatures, double learningRate, double preventOverfitting,
      double randomNoise, int numIterations, double learningRateDecay) throws TasteException {
    super(dataModel);
    this.dataModel = dataModel;
    // bias slots live inside the same vectors, so widen the requested rank
    this.numFeatures = numFeatures + FEATURE_OFFSET;
    this.numIterations = numIterations;

    this.learningRate = learningRate;
    this.learningRateDecay = learningRateDecay;
    this.preventOverfitting = preventOverfitting;
    this.randomNoise = randomNoise;
  }

  /**
   * Allocates the user/item matrices, seeds the bias slots and random features,
   * and caches + shuffles the training preferences.
   */
  protected void prepareTraining() throws TasteException {
    RandomWrapper random = RandomUtils.getRandom();
    userVectors = new double[dataModel.getNumUsers()][numFeatures];
    itemVectors = new double[dataModel.getNumItems()][numFeatures];

    double globalAverage = getAveragePreference();
    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
      userVectors[userIndex][0] = globalAverage;
      userVectors[userIndex][USER_BIAS_INDEX] = 0; // will store user bias
      userVectors[userIndex][ITEM_BIAS_INDEX] = 1; // corresponding item feature contains item bias
      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
        userVectors[userIndex][feature] = random.nextGaussian() * randomNoise;
      }
    }
    for (int itemIndex = 0; itemIndex < itemVectors.length; itemIndex++) {
      itemVectors[itemIndex][0] = 1; // corresponding user feature contains global average
      itemVectors[itemIndex][USER_BIAS_INDEX] = 1; // corresponding user feature contains user bias
      itemVectors[itemIndex][ITEM_BIAS_INDEX] = 0; // will store item bias
      for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
        itemVectors[itemIndex][feature] = random.nextGaussian() * randomNoise;
      }
    }

    cachePreferences();
    shufflePreferences();
  }

  /** Counts all preferences in the model by summing the per-user preference array lengths. */
  private int countPreferences() throws TasteException {
    int numPreferences = 0;
    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
    while (userIDs.hasNext()) {
      PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userIDs.nextLong());
      numPreferences += preferencesFromUser.length();
    }
    return numPreferences;
  }

  /** Copies every (userID, itemID) pair into the parallel cache arrays for fast iteration. */
  private void cachePreferences() throws TasteException {
    int numPreferences = countPreferences();
    cachedUserIDs = new long[numPreferences];
    cachedItemIDs = new long[numPreferences];

    LongPrimitiveIterator userIDs = dataModel.getUserIDs();
    int index = 0;
    while (userIDs.hasNext()) {
      long userID = userIDs.nextLong();
      PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userID);
      for (Preference preference : preferencesFromUser) {
        cachedUserIDs[index] = userID;
        cachedItemIDs[index] = preference.getItemID();
        index++;
      }
    }
  }

  /** Shuffles the cached preferences in place so SGD sees them in random order. */
  protected void shufflePreferences() {
    RandomWrapper random = RandomUtils.getRandom();
    /* Durstenfeld shuffle */
    for (int currentPos = cachedUserIDs.length - 1; currentPos > 0; currentPos--) {
      int swapPos = random.nextInt(currentPos + 1);
      swapCachedPreferences(currentPos, swapPos);
    }
  }

  /** Swaps the cached (user, item) pairs at the two positions, keeping the arrays parallel. */
  private void swapCachedPreferences(int posA, int posB) {
    long tmpUserIndex = cachedUserIDs[posA];
    long tmpItemIndex = cachedItemIDs[posA];

    cachedUserIDs[posA] = cachedUserIDs[posB];
    cachedItemIDs[posA] = cachedItemIDs[posB];

    cachedUserIDs[posB] = tmpUserIndex;
    cachedItemIDs[posB] = tmpItemIndex;
  }

  /**
   * Runs {@link #numIterations} SGD passes over the shuffled preferences, decaying the learning
   * rate after each pass, and returns the resulting factorization.
   */
  @Override
  public Factorization factorize() throws TasteException {
    prepareTraining();
    double currentLearningRate = learningRate;

    for (int it = 0; it < numIterations; it++) {
      for (int index = 0; index < cachedUserIDs.length; index++) {
        long userId = cachedUserIDs[index];
        long itemId = cachedItemIDs[index];
        // getPreferenceValue returns a boxed Float that is auto-unboxed here;
        // NOTE(review): this would NPE if a cached preference disappeared from the
        // model mid-training -- presumably the model is static during factorization, confirm
        float rating = dataModel.getPreferenceValue(userId, itemId);
        updateParameters(userId, itemId, rating, currentLearningRate);
      }
      currentLearningRate *= learningRateDecay;
    }
    return createFactorization(userVectors, itemVectors);
  }

  /** Returns the mean of all preference values in the model (the global average rating). */
  double getAveragePreference() throws TasteException {
    RunningAverage average = new FullRunningAverage();
    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {
      for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) {
        average.addDatum(pref.getValue());
      }
    }
    return average.getAverage();
  }

  /**
   * Performs one SGD step for a single rating: updates the two bias terms first,
   * then the latent features, all driven by the same prediction error.
   *
   * @param currentLearningRate the (possibly decayed) learning rate for this pass
   */
  protected void updateParameters(long userID, long itemID, float rating, double currentLearningRate) {
    int userIndex = userIndex(userID);
    int itemIndex = itemIndex(itemID);

    double[] userVector = userVectors[userIndex];
    double[] itemVector = itemVectors[itemIndex];
    double prediction = predictRating(userIndex, itemIndex);
    double err = rating - prediction;

    // adjust user bias
    userVector[USER_BIAS_INDEX] +=
        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * userVector[USER_BIAS_INDEX]);

    // adjust item bias
    itemVector[ITEM_BIAS_INDEX] +=
        biasLearningRate * currentLearningRate * (err - biasReg * preventOverfitting * itemVector[ITEM_BIAS_INDEX]);

    // adjust features; deltas are computed from the pre-update feature values
    for (int feature = FEATURE_OFFSET; feature < numFeatures; feature++) {
      double userFeature = userVector[feature];
      double itemFeature = itemVector[feature];

      double deltaUserFeature = err * itemFeature - preventOverfitting * userFeature;
      userVector[feature] += currentLearningRate * deltaUserFeature;

      double deltaItemFeature = err * userFeature - preventOverfitting * itemFeature;
      itemVector[feature] += currentLearningRate * deltaItemFeature;
    }
  }

  /** Predicted rating: full dot product, which includes the bias slots. */
  private double predictRating(int userID, int itemID) {
    double sum = 0;
    for (int feature = 0; feature < numFeatures; feature++) {
      sum += userVectors[userID][feature] * itemVectors[itemID][feature];
    }
    return sum;
  }
}
r***@apache.org
2018-06-28 14:55:00 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
new file mode 100644
index 0000000..bd1149b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import com.google.common.base.Function;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
/** Lazy adapters from Hadoop Writable wrappers to Mahout math types. */
public class IOUtils {

  private IOUtils() {}

  /**
   * Converts CentroidWritable values in a sequence file into Centroids lazily.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Centroid> with the converted vectors.
   */
  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
      Iterable<CentroidWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
      @Override
      public Centroid apply(CentroidWritable input) {
        Preconditions.checkNotNull(input);
        // clone so the returned Centroid is independent of the Writable's backing storage
        return input.getCentroid().clone();
      }
    });
  }

  /**
   * Converts ClusterWritable values in a sequence file into Centroids lazily,
   * assigning each resulting Centroid a sequential index and weighting it by the
   * cluster's total number of observations.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Centroid> with the converted vectors.
   */
  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
      // running index used as the id of each generated Centroid
      int numClusters = 0;
      @Override
      public Centroid apply(ClusterWritable input) {
        Preconditions.checkNotNull(input);
        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
            input.getValue().getTotalObservations());
      }
    });
  }

  /**
   * Converts VectorWritable values in a sequence file into Vectors lazily.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Vector> with the converted vectors.
   */
  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
      @Override
      public Vector apply(VectorWritable input) {
        Preconditions.checkNotNull(input);
        return input.get().clone();
      }
    });
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
new file mode 100644
index 0000000..083cd8c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.canopy;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Deprecated
+public final class Job extends AbstractJob {
+
+ private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+
+ private Job() {
+ }
+
+ private static final Logger log = LoggerFactory.getLogger(Job.class);
+
+ public static void main(String[] args) throws Exception {
+ if (args.length > 0) {
+ log.info("Running with only user-supplied arguments");
+ ToolRunner.run(new Configuration(), new Job(), args);
+ } else {
+ log.info("Running with default arguments");
+ Path output = new Path("output");
+ HadoopUtil.delete(new Configuration(), output);
+ run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
+ }
+ }
+
+ /**
+ * Run the canopy clustering job on an input dataset using the given distance
+ * measure, t1 and t2 parameters. All output data will be written to the
+ * output directory, which will be initially deleted if it exists. The
+ * clustered points will reside in the path <output>/clustered-points. By
+ * default, the job expects the a file containing synthetic_control.data as
+ * obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
+ * resides in a directory named "testdata", and writes output to a directory
+ * named "output".
+ *
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param measure
+ * the DistanceMeasure to use
+ * @param t1
+ * the canopy T1 threshold
+ * @param t2
+ * the canopy T2 threshold
+ */
+ private static void run(Path input, Path output, DistanceMeasure measure,
+ double t1, double t2) throws Exception {
+ Path directoryContainingConvertedInput = new Path(output,
+ DIRECTORY_CONTAINING_CONVERTED_INPUT);
+ InputDriver.runJob(input, directoryContainingConvertedInput,
+ "org.apache.mahout.math.RandomAccessSparseVector");
+ CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
+ output, measure, t1, t2, true, 0.0, false);
+ // run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
+ "clusters-0-final"), new Path(output, "clusteredPoints"));
+ clusterDumper.printClusters(null);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.t1Option().create());
+ addOption(DefaultOptionCreator.t2Option().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+
+ Map<String, List<String>> argMap = parseArguments(args);
+ if (argMap == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(new Configuration(), output);
+ }
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+
+ run(input, output, measure, t1, t2);
+ return 0;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
new file mode 100644
index 0000000..43beb78
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Job extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(Job.class);
+
+ private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+
+ private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
+
+ private Job() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ if (args.length > 0) {
+ log.info("Running with only user-supplied arguments");
+ ToolRunner.run(new Configuration(), new Job(), args);
+ } else {
+ log.info("Running with default arguments");
+ Path output = new Path("output");
+ Configuration conf = new Configuration();
+ HadoopUtil.delete(conf, output);
+ run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
+ }
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.convergenceOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption(DefaultOptionCreator.t1Option().create());
+ addOption(DefaultOptionCreator.t2Option().create());
+ addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
+
+ Map<String,List<String>> argMap = parseArguments(args);
+ if (argMap == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ if (measureClass == null) {
+ measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ }
+ double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+ float fuzziness = Float.parseFloat(getOption(M_OPTION));
+
+ addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
+ .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
+ .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create());
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta);
+ return 0;
+ }
+
+ /**
+ * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
+ * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+ * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
+ * containing synthetic_control.data as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
+ * and writes output to a directory named "output".
+ *
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param t1
+ * the canopy T1 threshold
+ * @param t2
+ * the canopy T2 threshold
+ * @param maxIterations
+ * the int maximum number of iterations
+ * @param fuzziness
+ * the float "m" fuzziness coefficient
+ * @param convergenceDelta
+ * the double convergence criteria for iterations
+ */
+ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
+ int maxIterations, float fuzziness, double convergenceDelta) throws Exception {
+ Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+ log.info("Preparing Input");
+ InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+ log.info("Running Canopy to get initial clusters");
+ Path canopyOutput = new Path(output, "canopies");
+ CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);
+ log.info("Running FuzzyKMeans");
+ FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output,
+ convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false);
+ // run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints"));
+ clusterDumper.printClusters(null);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
new file mode 100644
index 0000000..70c41fe
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.clustering.conversion.InputDriver;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.utils.clustering.ClusterDumper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Job extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(Job.class);
+
+ private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
+
+ private Job() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ if (args.length > 0) {
+ log.info("Running with only user-supplied arguments");
+ ToolRunner.run(new Configuration(), new Job(), args);
+ } else {
+ log.info("Running with default arguments");
+ Path output = new Path("output");
+ Configuration conf = new Configuration();
+ HadoopUtil.delete(conf, output);
+ run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
+ }
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.distanceMeasureOption().create());
+ addOption(DefaultOptionCreator.numClustersOption().create());
+ addOption(DefaultOptionCreator.t1Option().create());
+ addOption(DefaultOptionCreator.t2Option().create());
+ addOption(DefaultOptionCreator.convergenceOption().create());
+ addOption(DefaultOptionCreator.maxIterationsOption().create());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+
+ Map<String,List<String>> argMap = parseArguments(args);
+ if (argMap == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
+ if (measureClass == null) {
+ measureClass = SquaredEuclideanDistanceMeasure.class.getName();
+ }
+ double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
+ int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
+ if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
+ int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
+ run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
+ } else {
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
+ }
+ return 0;
+ }
+
+ /**
+ * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
+ * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+ * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
+ * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
+ * directory named "output".
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param measure
+ * the DistanceMeasure to use
+ * @param k
+ * the number of clusters in Kmeans
+ * @param convergenceDelta
+ * the double convergence criteria for iterations
+ * @param maxIterations
+ * the int maximum number of iterations
+ */
+ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
+ double convergenceDelta, int maxIterations) throws Exception {
+ Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+ log.info("Preparing Input");
+ InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+ log.info("Running random seed to get initial clusters");
+ Path clusters = new Path(output, "random-seeds");
+ clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
+ log.info("Running KMeans with k = {}", k);
+ KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta,
+ maxIterations, true, 0.0, false);
+ // run ClusterDumper
+ Path outGlob = new Path(output, "clusters-*-final");
+ Path clusteredPoints = new Path(output,"clusteredPoints");
+ log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
+ ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
+ clusterDumper.printClusters(null);
+ }
+
+ /**
+ * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
+ * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
+ * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
+ * containing synthetic_control.data as obtained from
+ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
+ * and writes output to a directory named "output".
+ *
+ * @param conf
+ * the Configuration to use
+ * @param input
+ * the String denoting the input directory path
+ * @param output
+ * the String denoting the output directory path
+ * @param measure
+ * the DistanceMeasure to use
+ * @param t1
+ * the canopy T1 threshold
+ * @param t2
+ * the canopy T2 threshold
+ * @param convergenceDelta
+ * the double convergence criteria for iterations
+ * @param maxIterations
+ * the int maximum number of iterations
+ */
+ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
+ double convergenceDelta, int maxIterations) throws Exception {
+ Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
+ log.info("Preparing Input");
+ InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
+ log.info("Running Canopy to get initial clusters");
+ Path canopyOutput = new Path(output, "canopies");
+ CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0,
+ false);
+ log.info("Running KMeans");
+ KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR
+ + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false);
+ // run ClusterDumper
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output,
+ "clusteredPoints"));
+ clusterDumper.printClusters(null);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
new file mode 100644
index 0000000..92363e5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import java.io.IOException;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper;
+
/**
 * Command-line driver that feeds the Delicious tags dataset into
 * {@link KeyBasedStringTupleGrouper}. Options select the input/output paths,
 * the field-splitting regular expression and the file encoding; everything is
 * forwarded to the grouping job through a {@link Parameters} object.
 */
public final class DeliciousTagsExample {
  private DeliciousTagsExample() { }

  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    // commons-cli2 builders used to assemble the option group below.
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Option recordSplitterOpt = obuilder.withLongName("splitterPattern").withArgument(
      abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
      "Regular Expression pattern used to split given line into fields."
          + " Default value splits comma or tab separated fields."
          + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ").withShortName("regex").create();
    Option encodingOpt = obuilder.withLongName("encoding").withArgument(
      abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
      "(Optional) The file encoding. Default value: UTF-8").withShortName("e").create();
    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputOpt).withOption(
      helpOpt).withOption(recordSplitterOpt).withOption(encodingOpt).create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }
      Parameters params = new Parameters();
      // Only override the grouper's default split pattern when one was supplied.
      if (cmdLine.hasOption(recordSplitterOpt)) {
        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
      }

      String encoding = "UTF-8";
      if (cmdLine.hasOption(encodingOpt)) {
        encoding = (String) cmdLine.getValue(encodingOpt);
      }
      params.set("encoding", encoding);
      String inputDir = (String) cmdLine.getValue(inputDirOpt);
      String outputDir = (String) cmdLine.getValue(outputOpt);
      params.set("input", inputDir);
      params.set("output", outputDir);
      // Field indices below presumably match the column layout of the Delicious
      // dataset (key = fields 1 and 2, value = field 3) -- confirm against the
      // consumer, KeyBasedStringTupleMapper, before changing.
      params.set("groupingFieldCount", "2");
      params.set("gfield0", "1");
      params.set("gfield1", "2");
      params.set("selectedFieldCount", "1");
      params.set("field0", "3");
      params.set("maxTransactionLength", "100");
      KeyBasedStringTupleGrouper.startJob(params);

    } catch (OptionException ex) {
      // Bad command line: print usage rather than a stack trace.
      CommandLineUtil.printHelp(group);
    }

  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
new file mode 100644
index 0000000..4c80a31
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.StringTuple;
+
+public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> {
+
+ @Override
+ protected void reduce(Text key,
+ Iterable<StringTuple> values,
+ Context context) throws IOException, InterruptedException {
+ Set<String> outputValues = new HashSet<>();
+ for (StringTuple value : values) {
+ outputValues.addAll(value.getEntries());
+ }
+ context.write(key, new StringTuple(outputValues));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
new file mode 100644
index 0000000..cd17770
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+
+public final class KeyBasedStringTupleGrouper {
+
+ private KeyBasedStringTupleGrouper() { }
+
+ public static void startJob(Parameters params) throws IOException,
+ InterruptedException,
+ ClassNotFoundException {
+ Configuration conf = new Configuration();
+
+ conf.set("job.parameters", params.toString());
+ conf.set("mapred.compress.map.output", "true");
+ conf.set("mapred.output.compression.type", "BLOCK");
+ conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
+ conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
+ + "org.apache.hadoop.io.serializer.WritableSerialization");
+
+ String input = params.get("input");
+ Job job = new Job(conf, "Generating dataset based from input" + input);
+ job.setJarByClass(KeyBasedStringTupleGrouper.class);
+
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(StringTuple.class);
+
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(Text.class);
+
+ FileInputFormat.addInputPath(job, new Path(input));
+ Path outPath = new Path(params.get("output"));
+ FileOutputFormat.setOutputPath(job, outPath);
+
+ HadoopUtil.delete(conf, outPath);
+
+ job.setInputFormatClass(TextInputFormat.class);
+ job.setMapperClass(KeyBasedStringTupleMapper.class);
+ job.setCombinerClass(KeyBasedStringTupleCombiner.class);
+ job.setReducerClass(KeyBasedStringTupleReducer.class);
+ job.setOutputFormatClass(TextOutputFormat.class);
+
+ boolean succeeded = job.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
new file mode 100644
index 0000000..362d1ce
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Splits the line using a {@link Pattern} and outputs key as given by the groupingFields
+ *
+ */
+public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> {
+
+ private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);
+
+ private Pattern splitter;
+
+ private int[] selectedFields;
+
+ private int[] groupingFields;
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+ String[] fields = splitter.split(value.toString());
+ if (fields.length != 4) {
+ log.info("{} {}", fields.length, value.toString());
+ context.getCounter("Map", "ERROR").increment(1);
+ return;
+ }
+ Collection<String> oKey = new ArrayList<>();
+ for (int groupingField : groupingFields) {
+ oKey.add(fields[groupingField]);
+ context.setStatus(fields[groupingField]);
+ }
+
+ List<String> oValue = new ArrayList<>();
+ for (int selectedField : selectedFields) {
+ oValue.add(fields[selectedField]);
+ }
+
+ context.write(new Text(oKey.toString()), new StringTuple(oValue));
+
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
+ splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));
+
+ int selectedFieldCount = Integer.valueOf(params.get("selectedFieldCount", "0"));
+ selectedFields = new int[selectedFieldCount];
+ for (int i = 0; i < selectedFieldCount; i++) {
+ selectedFields[i] = Integer.valueOf(params.get("field" + i, "0"));
+ }
+
+ int groupingFieldCount = Integer.valueOf(params.get("groupingFieldCount", "0"));
+ groupingFields = new int[groupingFieldCount];
+ for (int i = 0; i < groupingFieldCount; i++) {
+ groupingFields[i] = Integer.valueOf(params.get("gfield" + i, "0"));
+ }
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
new file mode 100644
index 0000000..a7ef762
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth.dataset;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringTuple;
+
+public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> {
+
+ private int maxTransactionLength = 100;
+
+ @Override
+ protected void reduce(Text key, Iterable<StringTuple> values, Context context)
+ throws IOException, InterruptedException {
+ Collection<String> items = new HashSet<>();
+
+ for (StringTuple value : values) {
+ for (String field : value.getEntries()) {
+ items.add(field);
+ }
+ }
+ if (items.size() > 1) {
+ int i = 0;
+ StringBuilder sb = new StringBuilder();
+ String sep = "";
+ for (String field : items) {
+ if (i % maxTransactionLength == 0) {
+ if (i != 0) {
+ context.write(null, new Text(sb.toString()));
+ }
+ sb.replace(0, sb.length(), "");
+ sep = "";
+ }
+
+ sb.append(sep).append(field);
+ sep = "\t";
+
+ i++;
+
+ }
+ if (sb.length() > 0) {
+ context.write(null, new Text(sb.toString()));
+ }
+ }
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
+ maxTransactionLength = Integer.valueOf(params.get("maxTransactionLength", "100"));
+ }
+}
r***@apache.org
2018-06-28 14:54:43 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
new file mode 100644
index 0000000..9f84e9c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DecisionTreeBuilder.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+import org.apache.mahout.classifier.df.split.IgSplit;
+import org.apache.mahout.classifier.df.split.OptIgSplit;
+import org.apache.mahout.classifier.df.split.RegressionSplit;
+import org.apache.mahout.classifier.df.split.Split;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Random;
+
+/**
+ * Builds a classification tree or regression tree<br>
+ * A classification tree is built when the criterion variable is the categorical attribute.<br>
+ * A regression tree is built when the criterion variable is the numerical attribute.
+ */
+@Deprecated
+public class DecisionTreeBuilder implements TreeBuilder {
+
+ private static final Logger log = LoggerFactory.getLogger(DecisionTreeBuilder.class);
+
+ private static final int[] NO_ATTRIBUTES = new int[0];
+ private static final double EPSILON = 1.0e-6;
+
+ /**
+ * indicates which CATEGORICAL attributes have already been selected in the parent nodes
+ */
+ private boolean[] selected;
+ /**
+ * number of attributes to select randomly at each node
+ */
+ private int m;
+ /**
+ * IgSplit implementation
+ */
+ private IgSplit igSplit;
+ /**
+ * tree is complemented
+ */
+ private boolean complemented = true;
+ /**
+ * minimum number for split
+ */
+ private double minSplitNum = 2.0;
+ /**
+ * minimum proportion of the total variance for split
+ */
+ private double minVarianceProportion = 1.0e-3;
+ /**
+ * full set data
+ */
+ private Data fullSet;
+ /**
+ * minimum variance for split
+ */
+ private double minVariance = Double.NaN;
+
  /**
   * Sets the number of attributes sampled at random at each node. A value of 0
   * makes build() pick a default: ceil(sqrt(#attributes)) for classification,
   * ceil(#attributes / 3) for regression.
   */
  public void setM(int m) {
    this.m = m;
  }
+
  /**
   * Sets the information-gain split implementation. If left unset, build()
   * chooses {@link RegressionSplit} for a numerical label and
   * {@link OptIgSplit} for a categorical one.
   */
  public void setIgSplit(IgSplit igSplit) {
    this.igSplit = igSplit;
  }
+
  /**
   * Enables/disables complemented tree building (default: true).
   * NOTE(review): the flag is consumed outside the visible portion of this
   * class -- confirm its exact effect before relying on it.
   */
  public void setComplemented(boolean complemented) {
    this.complemented = complemented;
  }
+
  /**
   * Sets the minimum number of instances a subset must contain for it to be
   * split further (default 2). Stored as a double for the size comparisons in
   * build().
   */
  public void setMinSplitNum(int minSplitNum) {
    this.minSplitNum = minSplitNum;
  }
+
  /**
   * Sets the proportion of the full data's per-instance label variance below
   * which a regression node becomes a leaf (default 1.0e-3); build() derives
   * minVariance from it on first use.
   */
  public void setMinVarianceProportion(double minVarianceProportion) {
    this.minVarianceProportion = minVarianceProportion;
  }
+
+ @Override
+ public Node build(Random rng, Data data) {
+ if (selected == null) {
+ selected = new boolean[data.getDataset().nbAttributes()];
+ selected[data.getDataset().getLabelId()] = true; // never select the label
+ }
+ if (m == 0) {
+ // set default m
+ double e = data.getDataset().nbAttributes() - 1;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ // regression
+ m = (int) Math.ceil(e / 3.0);
+ } else {
+ // classification
+ m = (int) Math.ceil(Math.sqrt(e));
+ }
+ }
+
+ if (data.isEmpty()) {
+ return new Leaf(Double.NaN);
+ }
+
+ double sum = 0.0;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ // regression
+ // sum and sum squared of a label is computed
+ double sumSquared = 0.0;
+ for (int i = 0; i < data.size(); i++) {
+ double label = data.getDataset().getLabel(data.get(i));
+ sum += label;
+ sumSquared += label * label;
+ }
+
+ // computes the variance
+ double var = sumSquared - (sum * sum) / data.size();
+
+ // computes the minimum variance
+ if (Double.compare(minVariance, Double.NaN) == 0) {
+ minVariance = var / data.size() * minVarianceProportion;
+ log.debug("minVariance:{}", minVariance);
+ }
+
+ // variance is compared with minimum variance
+ if ((var / data.size()) < minVariance) {
+ log.debug("variance({}) < minVariance({}) Leaf({})", var / data.size(), minVariance, sum / data.size());
+ return new Leaf(sum / data.size());
+ }
+ } else {
+ // classification
+ if (isIdentical(data)) {
+ return new Leaf(data.majorityLabel(rng));
+ }
+ if (data.identicalLabel()) {
+ return new Leaf(data.getDataset().getLabel(data.get(0)));
+ }
+ }
+
+ // store full set data
+ if (fullSet == null) {
+ fullSet = data;
+ }
+
+ int[] attributes = randomAttributes(rng, selected, m);
+ if (attributes == null || attributes.length == 0) {
+ // we tried all the attributes and could not split the data anymore
+ double label;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ // regression
+ label = sum / data.size();
+ } else {
+ // classification
+ label = data.majorityLabel(rng);
+ }
+ log.warn("attribute which can be selected is not found Leaf({})", label);
+ return new Leaf(label);
+ }
+
+ if (igSplit == null) {
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ // regression
+ igSplit = new RegressionSplit();
+ } else {
+ // classification
+ igSplit = new OptIgSplit();
+ }
+ }
+
+ // find the best split
+ Split best = null;
+ for (int attr : attributes) {
+ Split split = igSplit.computeSplit(data, attr);
+ if (best == null || best.getIg() < split.getIg()) {
+ best = split;
+ }
+ }
+
+ // information gain is near to zero.
+ if (best.getIg() < EPSILON) {
+ double label;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ label = sum / data.size();
+ } else {
+ label = data.majorityLabel(rng);
+ }
+ log.debug("ig is near to zero Leaf({})", label);
+ return new Leaf(label);
+ }
+
+ log.debug("best split attr:{}, split:{}, ig:{}", best.getAttr(), best.getSplit(), best.getIg());
+
+ boolean alreadySelected = selected[best.getAttr()];
+ if (alreadySelected) {
+ // attribute already selected
+ log.warn("attribute {} already selected in a parent node", best.getAttr());
+ }
+
+ Node childNode;
+ if (data.getDataset().isNumerical(best.getAttr())) {
+ boolean[] temp = null;
+
+ Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit()));
+ Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit()));
+
+ if (loSubset.isEmpty() || hiSubset.isEmpty()) {
+ // the selected attribute did not change the data, avoid using it in the child notes
+ selected[best.getAttr()] = true;
+ } else {
+ // the data changed, so we can unselect all previousely selected NUMERICAL attributes
+ temp = selected;
+ selected = cloneCategoricalAttributes(data.getDataset(), selected);
+ }
+
+ // size of the subset is less than the minSpitNum
+ if (loSubset.size() < minSplitNum || hiSubset.size() < minSplitNum) {
+ // branch is not split
+ double label;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ label = sum / data.size();
+ } else {
+ label = data.majorityLabel(rng);
+ }
+ log.debug("branch is not split Leaf({})", label);
+ return new Leaf(label);
+ }
+
+ Node loChild = build(rng, loSubset);
+ Node hiChild = build(rng, hiSubset);
+
+ // restore the selection state of the attributes
+ if (temp != null) {
+ selected = temp;
+ } else {
+ selected[best.getAttr()] = alreadySelected;
+ }
+
+ childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild);
+ } else { // CATEGORICAL attribute
+ double[] values = data.values(best.getAttr());
+
+ // tree is complemented
+ Collection<Double> subsetValues = null;
+ if (complemented) {
+ subsetValues = new HashSet<>();
+ for (double value : values) {
+ subsetValues.add(value);
+ }
+ values = fullSet.values(best.getAttr());
+ }
+
+ int cnt = 0;
+ Data[] subsets = new Data[values.length];
+ for (int index = 0; index < values.length; index++) {
+ if (complemented && !subsetValues.contains(values[index])) {
+ continue;
+ }
+ subsets[index] = data.subset(Condition.equals(best.getAttr(), values[index]));
+ if (subsets[index].size() >= minSplitNum) {
+ cnt++;
+ }
+ }
+
+ // size of the subset is less than the minSpitNum
+ if (cnt < 2) {
+ // branch is not split
+ double label;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ label = sum / data.size();
+ } else {
+ label = data.majorityLabel(rng);
+ }
+ log.debug("branch is not split Leaf({})", label);
+ return new Leaf(label);
+ }
+
+ selected[best.getAttr()] = true;
+
+ Node[] children = new Node[values.length];
+ for (int index = 0; index < values.length; index++) {
+ if (complemented && (subsetValues == null || !subsetValues.contains(values[index]))) {
+ // tree is complemented
+ double label;
+ if (data.getDataset().isNumerical(data.getDataset().getLabelId())) {
+ label = sum / data.size();
+ } else {
+ label = data.majorityLabel(rng);
+ }
+ log.debug("complemented Leaf({})", label);
+ children[index] = new Leaf(label);
+ continue;
+ }
+ children[index] = build(rng, subsets[index]);
+ }
+
+ selected[best.getAttr()] = alreadySelected;
+
+ childNode = new CategoricalNode(best.getAttr(), values, children);
+ }
+
+ return childNode;
+ }
+
+ /**
+ * checks if all the vectors have identical attribute values. Ignore selected attributes.
+ *
+ * @return true is all the vectors are identical or the data is empty<br>
+ * false otherwise
+ */
+ private boolean isIdentical(Data data) {
+ if (data.isEmpty()) {
+ return true;
+ }
+
+ Instance instance = data.get(0);
+ for (int attr = 0; attr < selected.length; attr++) {
+ if (selected[attr]) {
+ continue;
+ }
+
+ for (int index = 1; index < data.size(); index++) {
+ if (data.get(index).get(attr) != instance.get(attr)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Make a copy of the selection state of the attributes, unselect all numerical attributes
+ *
+ * @param selected selection state to clone
+ * @return cloned selection state
+ */
+ private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) {
+ boolean[] cloned = new boolean[selected.length];
+
+ for (int i = 0; i < selected.length; i++) {
+ cloned[i] = !dataset.isNumerical(i) && selected[i];
+ }
+ cloned[dataset.getLabelId()] = true;
+
+ return cloned;
+ }
+
+ /**
+ * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes
+ *
+ * @param rng random-numbers generator
+ * @param selected attributes' state (selected or not)
+ * @param m number of attributes to choose
+ * @return list of selected attributes' indices, or null if all attributes have already been selected
+ */
+ private static int[] randomAttributes(Random rng, boolean[] selected, int m) {
+ int nbNonSelected = 0; // number of non selected attributes
+ for (boolean sel : selected) {
+ if (!sel) {
+ nbNonSelected++;
+ }
+ }
+
+ if (nbNonSelected == 0) {
+ log.warn("All attributes are selected !");
+ return NO_ATTRIBUTES;
+ }
+
+ int[] result;
+ if (nbNonSelected <= m) {
+ // return all non selected attributes
+ result = new int[nbNonSelected];
+ int index = 0;
+ for (int attr = 0; attr < selected.length; attr++) {
+ if (!selected[attr]) {
+ result[index++] = attr;
+ }
+ }
+ } else {
+ result = new int[m];
+ for (int index = 0; index < m; index++) {
+ // randomly choose a "non selected" attribute
+ int rind;
+ do {
+ rind = rng.nextInt(selected.length);
+ } while (selected[rind]);
+
+ result[index] = rind;
+ selected[rind] = true; // temporarily set the chosen attribute to be selected
+ }
+
+ // the chosen attributes are not yet selected
+ for (int attr : result) {
+ selected[attr] = false;
+ }
+ }
+
+ return result;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
new file mode 100644
index 0000000..3392fb1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/DefaultTreeBuilder.java
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+import org.apache.mahout.classifier.df.node.CategoricalNode;
+import org.apache.mahout.classifier.df.node.Leaf;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.classifier.df.node.NumericalNode;
+import org.apache.mahout.classifier.df.split.IgSplit;
+import org.apache.mahout.classifier.df.split.OptIgSplit;
+import org.apache.mahout.classifier.df.split.Split;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Random;
+
+/**
+ * Builds a Decision Tree <br>
+ * Based on the algorithm described in the "Decision Trees" tutorials by Andrew W. Moore, available at:<br>
+ * <br>
+ * http://www.cs.cmu.edu/~awm/tutorials
+ * <br><br>
+ * This class can be used when the criterion variable is the categorical attribute.
+ */
+@Deprecated
+public class DefaultTreeBuilder implements TreeBuilder {
+
+ private static final Logger log = LoggerFactory.getLogger(DefaultTreeBuilder.class);
+
+ private static final int[] NO_ATTRIBUTES = new int[0];
+
+ /**
+ * indicates which CATEGORICAL attributes have already been selected in the parent nodes
+ */
+ private boolean[] selected;
+ /**
+ * number of attributes to select randomly at each node
+ */
+ private int m = 1;
+ /**
+ * IgSplit implementation
+ */
+ private final IgSplit igSplit;
+
+ public DefaultTreeBuilder() {
+ igSplit = new OptIgSplit();
+ }
+
+ public void setM(int m) {
+ this.m = m;
+ }
+
+ @Override
+ public Node build(Random rng, Data data) {
+
+ if (selected == null) {
+ selected = new boolean[data.getDataset().nbAttributes()];
+ selected[data.getDataset().getLabelId()] = true; // never select the label
+ }
+
+ if (data.isEmpty()) {
+ return new Leaf(-1);
+ }
+ if (isIdentical(data)) {
+ return new Leaf(data.majorityLabel(rng));
+ }
+ if (data.identicalLabel()) {
+ return new Leaf(data.getDataset().getLabel(data.get(0)));
+ }
+
+ int[] attributes = randomAttributes(rng, selected, m);
+ if (attributes == null || attributes.length == 0) {
+ // we tried all the attributes and could not split the data anymore
+ return new Leaf(data.majorityLabel(rng));
+ }
+
+ // find the best split
+ Split best = null;
+ for (int attr : attributes) {
+ Split split = igSplit.computeSplit(data, attr);
+ if (best == null || best.getIg() < split.getIg()) {
+ best = split;
+ }
+ }
+
+ boolean alreadySelected = selected[best.getAttr()];
+ if (alreadySelected) {
+ // attribute already selected
+ log.warn("attribute {} already selected in a parent node", best.getAttr());
+ }
+
+ Node childNode;
+ if (data.getDataset().isNumerical(best.getAttr())) {
+ boolean[] temp = null;
+
+ Data loSubset = data.subset(Condition.lesser(best.getAttr(), best.getSplit()));
+ Data hiSubset = data.subset(Condition.greaterOrEquals(best.getAttr(), best.getSplit()));
+
+ if (loSubset.isEmpty() || hiSubset.isEmpty()) {
+ // the selected attribute did not change the data, avoid using it in the child notes
+ selected[best.getAttr()] = true;
+ } else {
+ // the data changed, so we can unselect all previousely selected NUMERICAL attributes
+ temp = selected;
+ selected = cloneCategoricalAttributes(data.getDataset(), selected);
+ }
+
+ Node loChild = build(rng, loSubset);
+ Node hiChild = build(rng, hiSubset);
+
+ // restore the selection state of the attributes
+ if (temp != null) {
+ selected = temp;
+ } else {
+ selected[best.getAttr()] = alreadySelected;
+ }
+
+ childNode = new NumericalNode(best.getAttr(), best.getSplit(), loChild, hiChild);
+ } else { // CATEGORICAL attribute
+ selected[best.getAttr()] = true;
+
+ double[] values = data.values(best.getAttr());
+ Node[] children = new Node[values.length];
+
+ for (int index = 0; index < values.length; index++) {
+ Data subset = data.subset(Condition.equals(best.getAttr(), values[index]));
+ children[index] = build(rng, subset);
+ }
+
+ selected[best.getAttr()] = alreadySelected;
+
+ childNode = new CategoricalNode(best.getAttr(), values, children);
+ }
+
+ return childNode;
+ }
+
+ /**
+ * checks if all the vectors have identical attribute values. Ignore selected attributes.
+ *
+ * @return true is all the vectors are identical or the data is empty<br>
+ * false otherwise
+ */
+ private boolean isIdentical(Data data) {
+ if (data.isEmpty()) {
+ return true;
+ }
+
+ Instance instance = data.get(0);
+ for (int attr = 0; attr < selected.length; attr++) {
+ if (selected[attr]) {
+ continue;
+ }
+
+ for (int index = 1; index < data.size(); index++) {
+ if (data.get(index).get(attr) != instance.get(attr)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+
+ /**
+ * Make a copy of the selection state of the attributes, unselect all numerical attributes
+ *
+ * @param selected selection state to clone
+ * @return cloned selection state
+ */
+ private static boolean[] cloneCategoricalAttributes(Dataset dataset, boolean[] selected) {
+ boolean[] cloned = new boolean[selected.length];
+
+ for (int i = 0; i < selected.length; i++) {
+ cloned[i] = !dataset.isNumerical(i) && selected[i];
+ }
+
+ return cloned;
+ }
+
+ /**
+ * Randomly selects m attributes to consider for split, excludes IGNORED and LABEL attributes
+ *
+ * @param rng random-numbers generator
+ * @param selected attributes' state (selected or not)
+ * @param m number of attributes to choose
+ * @return list of selected attributes' indices, or null if all attributes have already been selected
+ */
+ protected static int[] randomAttributes(Random rng, boolean[] selected, int m) {
+ int nbNonSelected = 0; // number of non selected attributes
+ for (boolean sel : selected) {
+ if (!sel) {
+ nbNonSelected++;
+ }
+ }
+
+ if (nbNonSelected == 0) {
+ log.warn("All attributes are selected !");
+ return NO_ATTRIBUTES;
+ }
+
+ int[] result;
+ if (nbNonSelected <= m) {
+ // return all non selected attributes
+ result = new int[nbNonSelected];
+ int index = 0;
+ for (int attr = 0; attr < selected.length; attr++) {
+ if (!selected[attr]) {
+ result[index++] = attr;
+ }
+ }
+ } else {
+ result = new int[m];
+ for (int index = 0; index < m; index++) {
+ // randomly choose a "non selected" attribute
+ int rind;
+ do {
+ rind = rng.nextInt(selected.length);
+ } while (selected[rind]);
+
+ result[index] = rind;
+ selected[rind] = true; // temporarily set the chosen attribute to be selected
+ }
+
+ // the chosen attributes are not yet selected
+ for (int attr : result) {
+ selected[attr] = false;
+ }
+ }
+
+ return result;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
new file mode 100644
index 0000000..bf686a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/builder/TreeBuilder.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.builder;
+
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.util.Random;
+
/**
 * Contract for objects that grow a single decision tree from training data.
 * Implementations (e.g. {@code DefaultTreeBuilder}) are used by the deprecated
 * random-forest code to build the individual trees of a forest.
 */
@Deprecated
public interface TreeBuilder {

  /**
   * Builds a Decision tree using the training data
   *
   * @param rng
   *          random-numbers generator, used for attribute sampling and tie breaking
   * @param data
   *          training data
   * @return root Node of the induced tree
   */
  Node build(Random rng, Data data);

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
new file mode 100644
index 0000000..77e5ed5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Data.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import org.apache.mahout.classifier.df.data.conditions.Condition;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Holds a list of vectors and their corresponding Dataset. contains various operations that deals with the
+ * vectors (subset, count,...)
+ *
+ */
+@Deprecated
+public class Data implements Cloneable {
+
+ private final List<Instance> instances;
+
+ private final Dataset dataset;
+
+ public Data(Dataset dataset) {
+ this.dataset = dataset;
+ this.instances = new ArrayList<>();
+ }
+
+ public Data(Dataset dataset, List<Instance> instances) {
+ this.dataset = dataset;
+ this.instances = new ArrayList<>(instances);
+ }
+
+ /**
+ * @return the number of elements
+ */
+ public int size() {
+ return instances.size();
+ }
+
+ /**
+ * @return true if this data contains no element
+ */
+ public boolean isEmpty() {
+ return instances.isEmpty();
+ }
+
+ /**
+ * @param v
+ * element whose presence in this list if to be searched
+ * @return true is this data contains the specified element.
+ */
+ public boolean contains(Instance v) {
+ return instances.contains(v);
+ }
+
+ /**
+ * Returns the element at the specified position
+ *
+ * @param index
+ * index of element to return
+ * @return the element at the specified position
+ * @throws IndexOutOfBoundsException
+ * if the index is out of range
+ */
+ public Instance get(int index) {
+ return instances.get(index);
+ }
+
+ /**
+ * @return the subset from this data that matches the given condition
+ */
+ public Data subset(Condition condition) {
+ List<Instance> subset = new ArrayList<>();
+
+ for (Instance instance : instances) {
+ if (condition.isTrueFor(instance)) {
+ subset.add(instance);
+ }
+ }
+
+ return new Data(dataset, subset);
+ }
+
+ /**
+ * if data has N cases, sample N cases at random -but with replacement.
+ */
+ public Data bagging(Random rng) {
+ int datasize = size();
+ List<Instance> bag = new ArrayList<>(datasize);
+
+ for (int i = 0; i < datasize; i++) {
+ bag.add(instances.get(rng.nextInt(datasize)));
+ }
+
+ return new Data(dataset, bag);
+ }
+
+ /**
+ * if data has N cases, sample N cases at random -but with replacement.
+ *
+ * @param sampled
+ * indicating which instance has been sampled
+ *
+ * @return sampled data
+ */
+ public Data bagging(Random rng, boolean[] sampled) {
+ int datasize = size();
+ List<Instance> bag = new ArrayList<>(datasize);
+
+ for (int i = 0; i < datasize; i++) {
+ int index = rng.nextInt(datasize);
+ bag.add(instances.get(index));
+ sampled[index] = true;
+ }
+
+ return new Data(dataset, bag);
+ }
+
+ /**
+ * Splits the data in two, returns one part, and this gets the rest of the data. <b>VERY SLOW!</b>
+ */
+ public Data rsplit(Random rng, int subsize) {
+ List<Instance> subset = new ArrayList<>(subsize);
+
+ for (int i = 0; i < subsize; i++) {
+ subset.add(instances.remove(rng.nextInt(instances.size())));
+ }
+
+ return new Data(dataset, subset);
+ }
+
+ /**
+ * checks if all the vectors have identical attribute values
+ *
+ * @return true is all the vectors are identical or the data is empty<br>
+ * false otherwise
+ */
+ public boolean isIdentical() {
+ if (isEmpty()) {
+ return true;
+ }
+
+ Instance instance = get(0);
+ for (int attr = 0; attr < dataset.nbAttributes(); attr++) {
+ for (int index = 1; index < size(); index++) {
+ if (get(index).get(attr) != instance.get(attr)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * checks if all the vectors have identical label values
+ */
+ public boolean identicalLabel() {
+ if (isEmpty()) {
+ return true;
+ }
+
+ double label = dataset.getLabel(get(0));
+ for (int index = 1; index < size(); index++) {
+ if (dataset.getLabel(get(index)) != label) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * finds all distinct values of a given attribute
+ */
+ public double[] values(int attr) {
+ Collection<Double> result = new HashSet<>();
+
+ for (Instance instance : instances) {
+ result.add(instance.get(attr));
+ }
+
+ double[] values = new double[result.size()];
+
+ int index = 0;
+ for (Double value : result) {
+ values[index++] = value;
+ }
+
+ return values;
+ }
+
+ @Override
+ public Data clone() {
+ return new Data(dataset, new ArrayList<>(instances));
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof Data)) {
+ return false;
+ }
+
+ Data data = (Data) obj;
+
+ return instances.equals(data.instances) && dataset.equals(data.dataset);
+ }
+
+ @Override
+ public int hashCode() {
+ return instances.hashCode() + dataset.hashCode();
+ }
+
+ /**
+ * extract the labels of all instances
+ */
+ public double[] extractLabels() {
+ double[] labels = new double[size()];
+
+ for (int index = 0; index < labels.length; index++) {
+ labels[index] = dataset.getLabel(get(index));
+ }
+
+ return labels;
+ }
+
+ /**
+ * finds the majority label, breaking ties randomly<br>
+ * This method can be used when the criterion variable is the categorical attribute.
+ *
+ * @return the majority label value
+ */
+ public int majorityLabel(Random rng) {
+ // count the frequency of each label value
+ int[] counts = new int[dataset.nblabels()];
+
+ for (int index = 0; index < size(); index++) {
+ counts[(int) dataset.getLabel(get(index))]++;
+ }
+
+ // find the label values that appears the most
+ return DataUtils.maxindex(rng, counts);
+ }
+
+ /**
+ * Counts the number of occurrences of each label value<br>
+ * This method can be used when the criterion variable is the categorical attribute.
+ *
+ * @param counts
+ * will contain the results, supposed to be initialized at 0
+ */
+ public void countLabels(int[] counts) {
+ for (int index = 0; index < size(); index++) {
+ counts[(int) dataset.getLabel(get(index))]++;
+ }
+ }
+
+ public Dataset getDataset() {
+ return dataset;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
new file mode 100644
index 0000000..f1bdc95
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataConverter.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.mahout.math.DenseVector;
+
+import java.util.regex.Pattern;
+
+/**
+ * Converts String to Instance using a Dataset
+ */
+@Deprecated
+public class DataConverter {
+
+ private static final Pattern COMMA_SPACE = Pattern.compile("[, ]");
+
+ private final Dataset dataset;
+
+ public DataConverter(Dataset dataset) {
+ this.dataset = dataset;
+ }
+
+ public Instance convert(CharSequence string) {
+ // all attributes (categorical, numerical, label), ignored
+ int nball = dataset.nbAttributes() + dataset.getIgnored().length;
+
+ String[] tokens = COMMA_SPACE.split(string);
+ Preconditions.checkArgument(tokens.length == nball,
+ "Wrong number of attributes in the string: " + tokens.length + ". Must be " + nball);
+
+ int nbattrs = dataset.nbAttributes();
+ DenseVector vector = new DenseVector(nbattrs);
+
+ int aId = 0;
+ for (int attr = 0; attr < nball; attr++) {
+ if (!ArrayUtils.contains(dataset.getIgnored(), attr)) {
+ String token = tokens[attr].trim();
+
+ if ("?".equals(token)) {
+ // missing value
+ return null;
+ }
+
+ if (dataset.isNumerical(aId)) {
+ vector.set(aId++, Double.parseDouble(token));
+ } else { // CATEGORICAL
+ vector.set(aId, dataset.valueOf(aId, token));
+ aId++;
+ }
+ }
+ }
+
+ return new Instance(vector);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
new file mode 100644
index 0000000..c62dcac
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataLoader.java
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.classifier.df.data.Dataset.Attribute;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Converts the input data to a Vector Array using the information given by the Dataset.<br>
+ * Generates for each line a Vector that contains :<br>
+ * <ul>
+ * <li>double parsed value for NUMERICAL attributes</li>
+ * <li>int value for CATEGORICAL and LABEL attributes</li>
+ * </ul>
+ * <br>
+ * adds an IGNORED first attribute that will contain a unique id for each instance, which is the line number
+ * of the instance in the input data
+ */
+@Deprecated
+public final class DataLoader {
+
+ private static final Logger log = LoggerFactory.getLogger(DataLoader.class);
+
+ private static final Pattern SEPARATORS = Pattern.compile("[, ]");
+
+ private DataLoader() {}
+
+ /**
+ * Converts a comma-separated String to a Vector.
+ *
+ * @param attrs
+ * attributes description
+ * @param values
+ * used to convert CATEGORICAL attribute values to Integer
+ * @return false if there are missing values '?' or NUMERICAL attribute values is not numeric
+ */
+ private static boolean parseString(Attribute[] attrs, Set<String>[] values, CharSequence string,
+ boolean regression) {
+ String[] tokens = SEPARATORS.split(string);
+ Preconditions.checkArgument(tokens.length == attrs.length,
+ "Wrong number of attributes in the string: " + tokens.length + ". Must be: " + attrs.length);
+
+ // extract tokens and check is there is any missing value
+ for (int attr = 0; attr < attrs.length; attr++) {
+ if (!attrs[attr].isIgnored() && "?".equals(tokens[attr])) {
+ return false; // missing value
+ }
+ }
+
+ for (int attr = 0; attr < attrs.length; attr++) {
+ if (!attrs[attr].isIgnored()) {
+ String token = tokens[attr];
+ if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) {
+ // update values
+ if (values[attr] == null) {
+ values[attr] = new HashSet<>();
+ }
+ values[attr].add(token);
+ } else {
+ try {
+ Double.parseDouble(token);
+ } catch (NumberFormatException e) {
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Loads the data from a file
+ *
+ * @param fs
+ * file system
+ * @param fpath
+ * data file path
+ * @throws IOException
+ * if any problem is encountered
+ */
+
+ public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException {
+ FSDataInputStream input = fs.open(fpath);
+ Scanner scanner = new Scanner(input, "UTF-8");
+
+ List<Instance> instances = new ArrayList<>();
+
+ DataConverter converter = new DataConverter(dataset);
+
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ if (!line.isEmpty()) {
+ Instance instance = converter.convert(line);
+ if (instance != null) {
+ instances.add(instance);
+ } else {
+ // missing values found
+ log.warn("{}: missing values", instances.size());
+ }
+ } else {
+ log.warn("{}: empty string", instances.size());
+ }
+ }
+
+ scanner.close();
+ return new Data(dataset, instances);
+ }
+
+
+ /** Loads the data from multiple paths specified by pathes */
+ public static Data loadData(Dataset dataset, FileSystem fs, Path[] pathes) throws IOException {
+ List<Instance> instances = new ArrayList<>();
+
+ for (Path path : pathes) {
+ Data loadedData = loadData(dataset, fs, path);
+ for (int index = 0; index <= loadedData.size(); index++) {
+ instances.add(loadedData.get(index));
+ }
+ }
+ return new Data(dataset, instances);
+ }
+
+ /** Loads the data from a String array */
+ public static Data loadData(Dataset dataset, String[] data) {
+ List<Instance> instances = new ArrayList<>();
+
+ DataConverter converter = new DataConverter(dataset);
+
+ for (String line : data) {
+ if (!line.isEmpty()) {
+ Instance instance = converter.convert(line);
+ if (instance != null) {
+ instances.add(instance);
+ } else {
+ // missing values found
+ log.warn("{}: missing values", instances.size());
+ }
+ } else {
+ log.warn("{}: empty string", instances.size());
+ }
+ }
+
+ return new Data(dataset, instances);
+ }
+
+ /**
+ * Generates the Dataset by parsing the entire data
+ *
+ * @param descriptor attributes description
+ * @param regression if true, the label is numerical
+ * @param fs file system
+ * @param path data path
+ */
+ public static Dataset generateDataset(CharSequence descriptor,
+ boolean regression,
+ FileSystem fs,
+ Path path) throws DescriptorException, IOException {
+ Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
+
+ FSDataInputStream input = fs.open(path);
+ Scanner scanner = new Scanner(input, "UTF-8");
+
+ // used to convert CATEGORICAL attribute to Integer
+ @SuppressWarnings("unchecked")
+ Set<String>[] valsets = new Set[attrs.length];
+
+ int size = 0;
+ while (scanner.hasNextLine()) {
+ String line = scanner.nextLine();
+ if (!line.isEmpty()) {
+ if (parseString(attrs, valsets, line, regression)) {
+ size++;
+ }
+ }
+ }
+
+ scanner.close();
+
+ @SuppressWarnings("unchecked")
+ List<String>[] values = new List[attrs.length];
+ for (int i = 0; i < valsets.length; i++) {
+ if (valsets[i] != null) {
+ values[i] = Lists.newArrayList(valsets[i]);
+ }
+ }
+
+ return new Dataset(attrs, values, size, regression);
+ }
+
+ /**
+ * Generates the Dataset by parsing the entire data
+ *
+ * @param descriptor
+ * attributes description
+ */
+ public static Dataset generateDataset(CharSequence descriptor,
+ boolean regression,
+ String[] data) throws DescriptorException {
+ Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
+
+ // used to convert CATEGORICAL attributes to Integer
+ @SuppressWarnings("unchecked")
+ Set<String>[] valsets = new Set[attrs.length];
+
+ int size = 0;
+ for (String aData : data) {
+ if (!aData.isEmpty()) {
+ if (parseString(attrs, valsets, aData, regression)) {
+ size++;
+ }
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ List<String>[] values = new List[attrs.length];
+ for (int i = 0; i < valsets.length; i++) {
+ if (valsets[i] != null) {
+ values[i] = Lists.newArrayList(valsets[i]);
+ }
+ }
+
+ return new Dataset(attrs, values, size, regression);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
new file mode 100644
index 0000000..0889370
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DataUtils.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Helper methods that deals with data lists and arrays of values
+ */
+@Deprecated
+public final class DataUtils {
+ private DataUtils() { }
+
+ /**
+ * Computes the sum of the values
+ *
+ */
+ public static int sum(int[] values) {
+ int sum = 0;
+ for (int value : values) {
+ sum += value;
+ }
+
+ return sum;
+ }
+
+ /**
+ * foreach i : array1[i] += array2[i]
+ */
+ public static void add(int[] array1, int[] array2) {
+ Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length");
+ for (int index = 0; index < array1.length; index++) {
+ array1[index] += array2[index];
+ }
+ }
+
+ /**
+ * foreach i : array1[i] -= array2[i]
+ */
+ public static void dec(int[] array1, int[] array2) {
+ Preconditions.checkArgument(array1.length == array2.length, "array1.length != array2.length");
+ for (int index = 0; index < array1.length; index++) {
+ array1[index] -= array2[index];
+ }
+ }
+
+ /**
+ * return the index of the maximum of the array, breaking ties randomly
+ *
+ * @param rng
+ * used to break ties
+ * @return index of the maximum
+ */
+ public static int maxindex(Random rng, int[] values) {
+ int max = 0;
+ List<Integer> maxindices = new ArrayList<>();
+
+ for (int index = 0; index < values.length; index++) {
+ if (values[index] > max) {
+ max = values[index];
+ maxindices.clear();
+ maxindices.add(index);
+ } else if (values[index] == max) {
+ maxindices.add(index);
+ }
+ }
+
+ return maxindices.size() > 1 ? maxindices.get(rng.nextInt(maxindices.size())) : maxindices.get(0);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
new file mode 100644
index 0000000..a392669
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Dataset.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Preconditions;
+import com.google.common.io.Closeables;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.type.TypeReference;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
/**
 * Contains information about the attributes.
 */
@Deprecated
public class Dataset {

  /**
   * Attributes type
   */
  public enum Attribute {
    IGNORED,
    NUMERICAL,
    CATEGORICAL,
    LABEL;

    public boolean isNumerical() {
      return this == NUMERICAL;
    }

    public boolean isCategorical() {
      return this == CATEGORICAL;
    }

    public boolean isLabel() {
      return this == LABEL;
    }

    public boolean isIgnored() {
      return this == IGNORED;
    }

    /**
     * Case-insensitive parse of a type name.
     * NOTE(review): any unrecognized string silently maps to LABEL — there is
     * no strict validation here.
     */
    private static Attribute fromString(String from) {
      Attribute toReturn = LABEL;
      if (NUMERICAL.toString().equalsIgnoreCase(from)) {
        toReturn = NUMERICAL;
      } else if (CATEGORICAL.toString().equalsIgnoreCase(from)) {
        toReturn = CATEGORICAL;
      } else if (IGNORED.toString().equalsIgnoreCase(from)) {
        toReturn = IGNORED;
      }
      return toReturn;
    }
  }

  /** attribute types, with IGNORED columns removed (the label column is kept) */
  private Attribute[] attributes;

  /**
   * list of ignored attributes
   */
  private int[] ignored;

  /**
   * distinct values (CATEGORICAL attributes only; null for other types)
   */
  private String[][] values;

  /**
   * index of the label attribute in the loaded data (without ignored attributes)
   */
  private int labelId;

  /**
   * number of instances in the dataset
   */
  private int nbInstances;

  /** JSON serial/de-serial-izer */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  // Some literals for JSON representation
  static final String TYPE = "type";
  static final String VALUES = "values";
  static final String LABEL = "label";

  /** No-arg constructor used by {@link #fromJSON(String)} to build an instance field by field. */
  protected Dataset() {}

  /**
   * Should only be called by a DataLoader
   *
   * NOTE(review): this constructor mutates the caller's {@code attrs} array —
   * the LABEL entry is rewritten to NUMERICAL (regression) or CATEGORICAL.
   *
   * @param attrs attributes description
   * @param values distinct values for all CATEGORICAL attributes
   * @throws IllegalStateException if no label, or more than one label, is found
   */
  Dataset(Attribute[] attrs, List<String>[] values, int nbInstances, boolean regression) {
    validateValues(attrs, values);

    int nbattrs = countAttributes(attrs);

    // the label values are set apart
    attributes = new Attribute[nbattrs];
    this.values = new String[nbattrs][];
    ignored = new int[attrs.length - nbattrs]; // nbignored = total - nbattrs

    labelId = -1;
    int ignoredId = 0;
    int ind = 0;
    for (int attr = 0; attr < attrs.length; attr++) {
      if (attrs[attr].isIgnored()) {
        ignored[ignoredId++] = attr;
        continue;
      }

      if (attrs[attr].isLabel()) {
        if (labelId != -1) {
          throw new IllegalStateException("Label found more than once");
        }
        labelId = ind;
        // the label's concrete type depends on the problem kind
        if (regression) {
          attrs[attr] = Attribute.NUMERICAL;
        } else {
          attrs[attr] = Attribute.CATEGORICAL;
        }
      }

      if (attrs[attr].isCategorical() || (!regression && attrs[attr].isLabel())) {
        this.values[ind] = new String[values[attr].size()];
        values[attr].toArray(this.values[ind]);
      }

      attributes[ind++] = attrs[attr];
    }

    if (labelId == -1) {
      throw new IllegalStateException("Label not found");
    }

    this.nbInstances = nbInstances;
  }

  /** @return number of distinct values of the given (CATEGORICAL) attribute */
  public int nbValues(int attr) {
    return values[attr].length;
  }

  /** @return a defensive copy of the distinct label values */
  public String[] labels() {
    return Arrays.copyOf(values[labelId], nblabels());
  }

  /** @return number of distinct label values */
  public int nblabels() {
    return values[labelId].length;
  }

  /** @return index of the label attribute (ignored columns excluded) */
  public int getLabelId() {
    return labelId;
  }

  /** @return the (encoded) label value of the given instance */
  public double getLabel(Instance instance) {
    return instance.get(getLabelId());
  }

  /** @return type of the given attribute (ignored columns excluded) */
  public Attribute getAttribute(int attr) {
    return attributes[attr];
  }

  /**
   * Returns the code used to represent the label value in the data
   *
   * @param label label's value to code
   * @return label's code, or -1 if the label value is unknown
   */
  public int labelCode(String label) {
    return ArrayUtils.indexOf(values[labelId], label);
  }

  /**
   * Returns the label value in the data
   * This method can be used when the criterion variable is the categorical attribute.
   *
   * @param code label's code
   * @return label's value, or "unknown" if the code is NaN
   */
  public String getLabelString(double code) {
    // handle the case (prediction is NaN)
    if (Double.isNaN(code)) {
      return "unknown";
    }
    return values[labelId][(int) code];
  }

  @Override
  public String toString() {
    // only the attribute types are included; values/labelId are omitted
    return "attributes=" + Arrays.toString(attributes);
  }

  /**
   * Converts a token to its corresponding integer code for a given attribute
   *
   * @param attr attribute index
   * @return the token's position among the attribute's distinct values, or -1 if absent
   */
  public int valueOf(int attr, String token) {
    Preconditions.checkArgument(!isNumerical(attr), "Only for CATEGORICAL attributes");
    Preconditions.checkArgument(values != null, "Values not found (equals null)");
    return ArrayUtils.indexOf(values[attr], token);
  }

  /** @return original column indices of the IGNORED attributes (not a copy) */
  public int[] getIgnored() {
    return ignored;
  }

  /**
   * @return number of attributes that are not IGNORED
   */
  private static int countAttributes(Attribute[] attrs) {
    int nbattrs = 0;
    for (Attribute attr : attrs) {
      if (!attr.isIgnored()) {
        nbattrs++;
      }
    }
    return nbattrs;
  }

  /** Checks that every CATEGORICAL attribute comes with its list of distinct values. */
  private static void validateValues(Attribute[] attrs, List<String>[] values) {
    Preconditions.checkArgument(attrs.length == values.length, "attrs.length != values.length");
    for (int attr = 0; attr < attrs.length; attr++) {
      Preconditions.checkArgument(!attrs[attr].isCategorical() || values[attr] != null,
          "values not found for attribute " + attr);
    }
  }

  /**
   * @return number of attributes
   */
  public int nbAttributes() {
    return attributes.length;
  }

  /**
   * Is this a numerical attribute ?
   *
   * @param attr index of the attribute to check
   * @return true if the attribute is numerical
   */
  public boolean isNumerical(int attr) {
    return attributes[attr].isNumerical();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof Dataset)) {
      return false;
    }

    Dataset dataset = (Dataset) obj;

    if (!Arrays.equals(attributes, dataset.attributes)) {
      return false;
    }

    // compare the distinct-value tables row by row (null rows compare equal)
    for (int attr = 0; attr < nbAttributes(); attr++) {
      if (!Arrays.equals(values[attr], dataset.values[attr])) {
        return false;
      }
    }

    return labelId == dataset.labelId && nbInstances == dataset.nbInstances;
  }

  @Override
  public int hashCode() {
    int hashCode = labelId + 31 * nbInstances;
    for (Attribute attr : attributes) {
      hashCode = 31 * hashCode + attr.hashCode();
    }
    // null rows (non-categorical attributes) are skipped, consistent with equals()
    for (String[] valueRow : values) {
      if (valueRow == null) {
        continue;
      }
      for (String value : valueRow) {
        hashCode = 31 * hashCode + value.hashCode();
      }
    }
    return hashCode;
  }

  /**
   * Loads the dataset from a file
   *
   * NOTE(review): the whole file is read into memory, so this assumes the
   * JSON descriptor is small (the length is narrowed from long to int).
   *
   * @throws java.io.IOException
   */
  public static Dataset load(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    long bytesToRead = fs.getFileStatus(path).getLen();
    byte[] buff = new byte[Long.valueOf(bytesToRead).intValue()];
    FSDataInputStream input = fs.open(path);
    try {
      input.readFully(buff);
    } finally {
      Closeables.close(input, true);
    }
    String json = new String(buff, Charset.defaultCharset());
    return fromJSON(json);
  }


  /**
   * Serialize this instance to JSON
   * @return some JSON
   */
  public String toJSON() {
    List<Map<String, Object>> toWrite = new LinkedList<>();
    // attributes does not include ignored columns and it does include the class label
    int ignoredCount = 0;
    for (int i = 0; i < attributes.length + ignored.length; i++) {
      Map<String, Object> attribute;
      // i walks the original columns; attributesIndex walks the compacted arrays
      int attributesIndex = i - ignoredCount;
      if (ignoredCount < ignored.length && i == ignored[ignoredCount]) {
        // fill in ignored attribute
        attribute = getMap(Attribute.IGNORED, null, false);
        ignoredCount++;
      } else if (attributesIndex == labelId) {
        // fill in the label
        attribute = getMap(attributes[attributesIndex], values[attributesIndex], true);
      } else {
        // normal attribute
        attribute = getMap(attributes[attributesIndex], values[attributesIndex], false);
      }
      toWrite.add(attribute);
    }
    try {
      return OBJECT_MAPPER.writeValueAsString(toWrite);
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * De-serialize an instance from a string
   *
   * NOTE(review): {@code nbInstances} is not part of the JSON representation,
   * so it is left at 0 on the returned instance.
   *
   * @param json From which an instance is created
   * @return A shiny new Dataset
   */
  public static Dataset fromJSON(String json) {
    List<Map<String, Object>> fromJSON;
    try {
      fromJSON = OBJECT_MAPPER.readValue(json, new TypeReference<List<Map<String, Object>>>() {});
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
    List<Attribute> attributes = new LinkedList<>();
    List<Integer> ignored = new LinkedList<>();
    String[][] nominalValues = new String[fromJSON.size()][];
    Dataset dataset = new Dataset();
    for (int i = 0; i < fromJSON.size(); i++) {
      Map<String, Object> attribute = fromJSON.get(i);
      if (Attribute.fromString((String) attribute.get(TYPE)) == Attribute.IGNORED) {
        ignored.add(i);
      } else {
        Attribute asAttribute = Attribute.fromString((String) attribute.get(TYPE));
        attributes.add(asAttribute);
        // i - ignored.size() compacts original column indices, mirroring toJSON()
        if ((Boolean) attribute.get(LABEL)) {
          dataset.labelId = i - ignored.size();
        }
        if (attribute.get(VALUES) != null) {
          List<String> get = (List<String>) attribute.get(VALUES);
          String[] array = get.toArray(new String[get.size()]);
          nominalValues[i - ignored.size()] = array;
        }
      }
    }
    dataset.attributes = attributes.toArray(new Attribute[attributes.size()]);
    dataset.ignored = new int[ignored.size()];
    dataset.values = nominalValues;
    for (int i = 0; i < dataset.ignored.length; i++) {
      dataset.ignored[i] = ignored.get(i);
    }
    return dataset;
  }

  /**
   * Generate a map to describe an attribute
   * @param type The type
   * @param values - values (may be null for non-categorical attributes)
   * @param isLabel - is a label
   * @return map of (AttributeTypes, Values)
   */
  private Map<String, Object> getMap(Attribute type, String[] values, boolean isLabel) {
    Map<String, Object> attribute = new HashMap<>();
    attribute.put(TYPE, type.toString().toLowerCase(Locale.getDefault()));
    attribute.put(VALUES, values);
    attribute.put(LABEL, isLabel);
    return attribute;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
new file mode 100644
index 0000000..e7a10ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorException.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
/**
 * Exception thrown when parsing a descriptor
 */
@Deprecated
public class DescriptorException extends Exception {

  // Exception implements Serializable; declare an explicit id so the
  // serialized form does not depend on a compiler-computed default.
  private static final long serialVersionUID = 1L;

  /**
   * @param msg description of the parsing failure
   */
  public DescriptorException(String msg) {
    super(msg);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
new file mode 100644
index 0000000..aadedbd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/DescriptorUtils.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import com.google.common.base.Splitter;
+import org.apache.mahout.classifier.df.data.Dataset.Attribute;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Contains various methods that deal with descriptor strings
+ */
+@Deprecated
+public final class DescriptorUtils {
+
+ private static final Splitter SPACE = Splitter.on(' ').omitEmptyStrings();
+
+ private DescriptorUtils() { }
+
+ /**
+ * Parses a descriptor string and generates the corresponding array of Attributes
+ *
+ * @throws DescriptorException
+ * if a bad token is encountered
+ */
+ public static Attribute[] parseDescriptor(CharSequence descriptor) throws DescriptorException {
+ List<Attribute> attributes = new ArrayList<>();
+ for (String token : SPACE.split(descriptor)) {
+ token = token.toUpperCase(Locale.ENGLISH);
+ if ("I".equals(token)) {
+ attributes.add(Attribute.IGNORED);
+ } else if ("N".equals(token)) {
+ attributes.add(Attribute.NUMERICAL);
+ } else if ("C".equals(token)) {
+ attributes.add(Attribute.CATEGORICAL);
+ } else if ("L".equals(token)) {
+ attributes.add(Attribute.LABEL);
+ } else {
+ throw new DescriptorException("Bad Token : " + token);
+ }
+ }
+ return attributes.toArray(new Attribute[attributes.size()]);
+ }
+
+ /**
+ * Generates a valid descriptor string from a user-friendly representation.<br>
+ * for example "3 N I N N 2 C L 5 I" generates "N N N I N N C C L I I I I I".<br>
+ * this useful when describing datasets with a large number of attributes
+ * @throws DescriptorException
+ */
+ public static String generateDescriptor(CharSequence description) throws DescriptorException {
+ return generateDescriptor(SPACE.split(description));
+ }
+
+ /**
+ * Generates a valid descriptor string from a list of tokens
+ * @throws DescriptorException
+ */
+ public static String generateDescriptor(Iterable<String> tokens) throws DescriptorException {
+ StringBuilder descriptor = new StringBuilder();
+
+ int multiplicator = 0;
+
+ for (String token : tokens) {
+ try {
+ // try to parse an integer
+ int number = Integer.parseInt(token);
+
+ if (number <= 0) {
+ throw new DescriptorException("Multiplicator (" + number + ") must be > 0");
+ }
+ if (multiplicator > 0) {
+ throw new DescriptorException("A multiplicator cannot be followed by another multiplicator");
+ }
+
+ multiplicator = number;
+ } catch (NumberFormatException e) {
+ // token is not a number
+ if (multiplicator == 0) {
+ multiplicator = 1;
+ }
+
+ for (int index = 0; index < multiplicator; index++) {
+ descriptor.append(token).append(' ');
+ }
+
+ multiplicator = 0;
+ }
+ }
+
+ return descriptor.toString().trim();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
new file mode 100644
index 0000000..6a23cb8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/Instance.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * Represents one data instance.
+ */
+@Deprecated
+public class Instance {
+
+ /** attributes, except LABEL and IGNORED */
+ private final Vector attrs;
+
+ public Instance(Vector attrs) {
+ this.attrs = attrs;
+ }
+
+ /**
+ * Return the attribute at the specified position
+ *
+ * @param index
+ * position of the attribute to retrieve
+ * @return value of the attribute
+ */
+ public double get(int index) {
+ return attrs.getQuick(index);
+ }
+
+ /**
+ * Set the value at the given index
+ *
+ * @param value
+ * a double value to set
+ */
+ public void set(int index, double value) {
+ attrs.set(index, value);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof Instance)) {
+ return false;
+ }
+
+ Instance instance = (Instance) obj;
+
+ return /*id == instance.id &&*/ attrs.equals(instance.attrs);
+
+ }
+
+ @Override
+ public int hashCode() {
+ return /*id +*/ attrs.hashCode();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
new file mode 100644
index 0000000..c16ca3f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Condition.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
/**
 * Condition on Instance. Concrete subclasses test a single attribute of an
 * instance against a fixed value; the static factories below build them.
 */
@Deprecated
public abstract class Condition {

  /**
   * Returns true if the checked instance matches the condition
   *
   * @param instance
   *          checked instance
   * @return true if the checked instance matches the condition
   */
  public abstract boolean isTrueFor(Instance instance);

  /**
   * Condition that checks if the given attribute has a value "equal" to the given value
   *
   * @param attr index of the attribute to test
   * @param value value compared against
   */
  public static Condition equals(int attr, double value) {
    return new Equals(attr, value);
  }

  /**
   * Condition that checks if the given attribute has a value "lesser" than the given value
   *
   * @param attr index of the attribute to test
   * @param value value compared against
   */
  public static Condition lesser(int attr, double value) {
    return new Lesser(attr, value);
  }

  /**
   * Condition that checks if the given attribute has a value "greater or equal" than the given value
   *
   * @param attr index of the attribute to test
   * @param value value compared against
   */
  public static Condition greaterOrEquals(int attr, double value) {
    return new GreaterOrEquals(attr, value);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
new file mode 100644
index 0000000..c51082b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Equals.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a given value
+ */
+@Deprecated
+public class Equals extends Condition {
+
+ private final int attr;
+
+ private final double value;
+
+ public Equals(int attr, double value) {
+ this.attr = attr;
+ this.value = value;
+ }
+
+ @Override
+ public boolean isTrueFor(Instance instance) {
+ return instance.get(attr) == value;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
new file mode 100644
index 0000000..3e3d1a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/GreaterOrEquals.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a value "greater or equal" than a given value
+ */
+@Deprecated
+public class GreaterOrEquals extends Condition {
+
+ private final int attr;
+
+ private final double value;
+
+ public GreaterOrEquals(int attr, double value) {
+ this.attr = attr;
+ this.value = value;
+ }
+
+ @Override
+ public boolean isTrueFor(Instance v) {
+ return v.get(attr) >= value;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
new file mode 100644
index 0000000..577cb24
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/data/conditions/Lesser.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df.data.conditions;
+
+import org.apache.mahout.classifier.df.data.Instance;
+
+/**
+ * True if a given attribute has a value "lesser" than a given value
+ */
+@Deprecated
+public class Lesser extends Condition {
+
+ private final int attr;
+
+ private final double value;
+
+ public Lesser(int attr, double value) {
+ this.attr = attr;
+ this.value = value;
+ }
+
+ @Override
+ public boolean isTrueFor(Instance instance) {
+ return instance.get(attr) < value;
+ }
+
+}
r***@apache.org
2018-06-28 14:54:53 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
new file mode 100644
index 0000000..fd6657f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveImplicitFeedbackMapper.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.als.ImplicitFeedbackAlternatingLeastSquaresSolver;
+
+import java.io.IOException;
+
+/** Solving mapper that can be safely executed using multiple threads */
+public class SolveImplicitFeedbackMapper
+ extends SharingMapper<IntWritable,VectorWritable,IntWritable,VectorWritable,
+ ImplicitFeedbackAlternatingLeastSquaresSolver> {
+
+ private final VectorWritable uiOrmj = new VectorWritable();
+
+ @Override
+ ImplicitFeedbackAlternatingLeastSquaresSolver createSharedInstance(Context ctx) throws IOException {
+ Configuration conf = ctx.getConfiguration();
+
+ double lambda = Double.parseDouble(conf.get(ParallelALSFactorizationJob.LAMBDA));
+ double alpha = Double.parseDouble(conf.get(ParallelALSFactorizationJob.ALPHA));
+ int numFeatures = conf.getInt(ParallelALSFactorizationJob.NUM_FEATURES, -1);
+ int numEntities = Integer.parseInt(conf.get(ParallelALSFactorizationJob.NUM_ENTITIES));
+
+ Preconditions.checkArgument(numFeatures > 0, "numFeatures must be greater then 0!");
+
+ return new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha,
+ ALS.readMatrixByRowsFromDistributedCache(numEntities, conf), 1);
+ }
+
+ @Override
+ protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
+ throws IOException, InterruptedException {
+ ImplicitFeedbackAlternatingLeastSquaresSolver solver = getSharedInstance();
+ uiOrmj.set(solver.solve(ratingsWritable.get()));
+ ctx.write(userOrItemID, uiOrmj);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
new file mode 100644
index 0000000..b44fd5b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/AggregateAndRecommendReducer.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.MutableRecommendedItem;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.TopItemsQueue;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * <p>Computes prediction values for each user.</p>
 *
 * <pre>
 * u = a user
 * i = an item not yet rated by u
 * N = all items similar to i (where similarity is usually computed by pairwise comparison of the item-vectors
 * of the user-item matrix)
 *
 * Prediction(u,i) = sum(all n from N: similarity(i,n) * rating(u,n)) / sum(all n from N: abs(similarity(i,n)))
 * </pre>
 */
public final class AggregateAndRecommendReducer extends
    Reducer<VarLongWritable,PrefAndSimilarityColumnWritable,VarLongWritable,RecommendedItemsWritable> {

  private static final Logger log = LoggerFactory.getLogger(AggregateAndRecommendReducer.class);

  // Configuration keys read in setup().
  static final String ITEMID_INDEX_PATH = "itemIDIndexPath";
  static final String NUM_RECOMMENDATIONS = "numRecommendations";
  static final int DEFAULT_NUM_RECOMMENDATIONS = 10;
  static final String ITEMS_FILE = "itemsFile";

  // Whether input preferences are boolean (presence-only) rather than rated.
  private boolean booleanData;
  // Maximum number of recommendations emitted per user.
  private int recommendationsPerUser;
  // Supplies the optional per-user item filter (userItemFile option).
  private IDReader idReader;
  // Optional global whitelist of item IDs to recommend; null when unrestricted.
  private FastIDSet itemsToRecommendFor;
  // Maps hashed item indices back to the original long item IDs.
  private OpenIntLongHashMap indexItemIDMap;

  // Reused output value to avoid allocating a writable per reduce call.
  private final RecommendedItemsWritable recommendedItems = new RecommendedItemsWritable();

  private static final float BOOLEAN_PREF_VALUE = 1.0f;

  @Override
  protected void setup(Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    recommendationsPerUser = conf.getInt(NUM_RECOMMENDATIONS, DEFAULT_NUM_RECOMMENDATIONS);
    booleanData = conf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
    indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEMID_INDEX_PATH), conf);

    idReader = new IDReader(conf);
    idReader.readIDs();
    itemsToRecommendFor = idReader.getItemIds();
  }

  /** Dispatches to the boolean or rated-preference aggregation strategy. */
  @Override
  protected void reduce(VarLongWritable userID,
                        Iterable<PrefAndSimilarityColumnWritable> values,
                        Context context) throws IOException, InterruptedException {
    if (booleanData) {
      reduceBooleanData(userID, values, context);
    } else {
      reduceNonBooleanData(userID, values, context);
    }
  }

  /**
   * Aggregation for boolean (presence-only) data: ranks items by the sum of
   * their similarity values instead of an estimated preference.
   */
  private void reduceBooleanData(VarLongWritable userID,
                                 Iterable<PrefAndSimilarityColumnWritable> values,
                                 Context context) throws IOException, InterruptedException {
    /* having boolean data, each estimated preference can only be 1,
     * however we can't use this to rank the recommended items,
     * so we use the sum of similarities for that. */
    Iterator<PrefAndSimilarityColumnWritable> columns = values.iterator();
    // Reuse the first column as the accumulator, then add the rest onto it.
    Vector predictions = columns.next().getSimilarityColumn();
    while (columns.hasNext()) {
      predictions.assign(columns.next().getSimilarityColumn(), Functions.PLUS);
    }
    writeRecommendedItems(userID, predictions, context);
  }

  /**
   * Aggregation for rated data: builds the weighted-average prediction formula
   * (see class javadoc) from the per-item similarity columns and preference values.
   */
  private void reduceNonBooleanData(VarLongWritable userID,
                                    Iterable<PrefAndSimilarityColumnWritable> values,
                                    Context context) throws IOException, InterruptedException {
    /* each entry here is the sum in the numerator of the prediction formula */
    Vector numerators = null;
    /* each entry here is the sum in the denominator of the prediction formula */
    Vector denominators = null;
    /* each entry here is the number of similar items used in the prediction formula */
    Vector numberOfSimilarItemsUsed = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);

    for (PrefAndSimilarityColumnWritable prefAndSimilarityColumn : values) {
      Vector simColumn = prefAndSimilarityColumn.getSimilarityColumn();
      float prefValue = prefAndSimilarityColumn.getPrefValue();
      /* count the number of items used for each prediction */
      for (Element e : simColumn.nonZeroes()) {
        int itemIDIndex = e.index();
        numberOfSimilarItemsUsed.setQuick(itemIDIndex, numberOfSimilarItemsUsed.getQuick(itemIDIndex) + 1);
      }

      // Denominator accumulates |similarity|; first column is cloned so simColumn stays intact.
      if (denominators == null) {
        denominators = simColumn.clone();
      } else {
        denominators.assign(simColumn, Functions.PLUS_ABS);
      }

      // Numerator accumulates similarity * preference; the multiplication is skipped
      // when the preference equals 1.0f since it would be a no-op.
      if (numerators == null) {
        numerators = simColumn.clone();
        if (prefValue != BOOLEAN_PREF_VALUE) {
          numerators.assign(Functions.MULT, prefValue);
        }
      } else {
        if (prefValue != BOOLEAN_PREF_VALUE) {
          // NOTE: mutates simColumn in place; safe because it is not reused afterwards.
          simColumn.assign(Functions.MULT, prefValue);
        }
        numerators.assign(simColumn, Functions.PLUS);
      }

    }

    // No input columns at all: nothing to recommend for this user.
    if (numerators == null) {
      return;
    }

    Vector recommendationVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (Element element : numerators.nonZeroes()) {
      int itemIDIndex = element.index();
      /* preference estimations must be based on at least 2 datapoints */
      if (numberOfSimilarItemsUsed.getQuick(itemIDIndex) > 1) {
        /* compute normalized prediction */
        double prediction = element.get() / denominators.getQuick(itemIDIndex);
        recommendationVector.setQuick(itemIDIndex, prediction);
      }
    }
    writeRecommendedItems(userID, recommendationVector, context);
  }

  /**
   * Finds the top entries in recommendationVector, maps them to the real itemIDs
   * and writes back the result.
   */
  private void writeRecommendedItems(VarLongWritable userID, Vector recommendationVector, Context context)
    throws IOException, InterruptedException {
    TopItemsQueue topKItems = new TopItemsQueue(recommendationsPerUser);
    FastIDSet itemsForUser = null;

    if (idReader != null && idReader.isUserItemFilterSpecified()) {
      itemsForUser = idReader.getItemsToRecommendForUser(userID.get());
    }

    for (Element element : recommendationVector.nonZeroes()) {
      int index = element.index();
      long itemID;
      if (indexItemIDMap != null && !indexItemIDMap.isEmpty()) {
        itemID = indexItemIDMap.get(index);
      } else { // we don't have any mappings, so just use the original
        itemID = index;
      }

      if (shouldIncludeItemIntoRecommendations(itemID, itemsToRecommendFor, itemsForUser)) {

        // NaN values (injected by ItemFilterAsVectorAndPrefsReducer) are skipped here,
        // which is what excludes explicitly filtered items from the output.
        float value = (float) element.get();
        if (!Float.isNaN(value)) {

          MutableRecommendedItem topItem = topKItems.top();
          if (value > topItem.getValue()) {
            topItem.set(itemID, value);
            topKItems.updateTop();
          }
        }
      }
    }

    List<RecommendedItem> topItems = topKItems.getTopItems();
    if (!topItems.isEmpty()) {
      recommendedItems.set(topItems);
      context.write(userID, recommendedItems);
    }
  }

  /**
   * Decides whether an item may appear in the output: the per-user filter takes
   * precedence over the global whitelist; with neither present, everything passes.
   */
  private boolean shouldIncludeItemIntoRecommendations(long itemID, FastIDSet allItemsToRecommendFor,
                                                       FastIDSet itemsForUser) {
    if (allItemsToRecommendFor == null && itemsForUser == null) {
      return true;
    } else if (itemsForUser != null) {
      return itemsForUser.contains(itemID);
    } else {
      return allItemsToRecommendFor.contains(itemID);
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
new file mode 100644
index 0000000..7797fe9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/IDReader.java
@@ -0,0 +1,244 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * Reads user ids and item ids from files specified in usersFile, itemsFile or userItemFile options in item-based
 * recommender. Composes a list of users and a list of items which can be used by
 * {@link org.apache.mahout.cf.taste.hadoop.item.UserVectorSplitterMapper} and
 * {@link org.apache.mahout.cf.taste.hadoop.item.AggregateAndRecommendReducer}.
 *
 * <p>The userItemFile option is mutually exclusive with usersFile and itemsFile:
 * {@link #readIDs()} throws {@link IllegalStateException} when they are combined.</p>
 */
public class IDReader {

  static final String USER_ITEM_FILE = "userItemFile";

  private static final Logger log = LoggerFactory.getLogger(IDReader.class);
  // Fields may be separated by tabs or commas.
  private static final Pattern SEPARATOR = Pattern.compile("[\t,]");

  private Configuration conf;

  // Paths taken from the job configuration; any of them may be null when unset.
  private String usersFile;
  private String itemsFile;
  private String userItemFile;

  // Populated by readIDs(); remain null until then.
  private FastIDSet userIds;
  private FastIDSet itemIds;

  // Shared immutable-by-convention empty result for users without a filter entry.
  private FastIDSet emptySet;

  /* Key - user id, value - a set of item ids to include into recommendations for this user */
  private Map<Long, FastIDSet> userItemFilter;

  /**
   * Creates a new IDReader
   *
   * @param conf Job configuration
   */
  public IDReader(Configuration conf) {
    this.conf = conf;
    emptySet = new FastIDSet();

    usersFile = conf.get(UserVectorSplitterMapper.USERS_FILE);
    itemsFile = conf.get(AggregateAndRecommendReducer.ITEMS_FILE);
    userItemFile = conf.get(USER_ITEM_FILE);
  }

  /**
   * Reads user ids and item ids from files specified in a job configuration
   *
   * @throws IOException if an error occurs during file read operation
   *
   * @throws IllegalStateException if userItemFile option is specified together with usersFile or itemsFile
   */
  public void readIDs() throws IOException, IllegalStateException {
    if (isUserItemFileSpecified()) {
      readUserItemFilterIfNeeded();
    }

    if (isUsersFileSpecified() || isUserItemFilterSpecified()) {
      readUserIds();
    }

    if (isItemsFileSpecified() || isUserItemFilterSpecified()) {
      readItemIds();
    }
  }

  /**
   * Gets a collection of items which should be recommended for a user
   *
   * @param userId ID of a user we are interested in
   * @return if a userItemFile option is specified, and that file contains at least one item ID for the user,
   *         then this method returns a {@link FastIDSet} object populated with item IDs. Otherwise, this
   *         method returns an empty set.
   */
  public FastIDSet getItemsToRecommendForUser(Long userId) {
    if (isUserItemFilterSpecified() && userItemFilter.containsKey(userId)) {
      return userItemFilter.get(userId);
    } else {
      return emptySet;
    }
  }

  /**
   * Populates {@link #userIds} from exactly one of usersFile or userItemFile;
   * specifying both (or neither) is an error.
   */
  private void readUserIds() throws IOException, IllegalStateException {
    if (isUsersFileSpecified() && !isUserItemFileSpecified()) {
      userIds = readIDList(usersFile);
    } else if (isUserItemFileSpecified() && !isUsersFileSpecified()) {
      readUserItemFilterIfNeeded();
      userIds = extractAllUserIdsFromUserItemFilter(userItemFilter);
    } else if (!isUsersFileSpecified()) {
      throw new IllegalStateException("Neither usersFile nor userItemFile options are specified");
    } else {
      throw new IllegalStateException("usersFile and userItemFile options cannot be used simultaneously");
    }
  }

  /**
   * Populates {@link #itemIds} from exactly one of itemsFile or userItemFile;
   * specifying both (or neither) is an error.
   */
  private void readItemIds() throws IOException, IllegalStateException {
    if (isItemsFileSpecified() && !isUserItemFileSpecified()) {
      itemIds = readIDList(itemsFile);
    } else if (isUserItemFileSpecified() && !isItemsFileSpecified()) {
      readUserItemFilterIfNeeded();
      itemIds = extractAllItemIdsFromUserItemFilter(userItemFilter);
    } else if (!isItemsFileSpecified()) {
      throw new IllegalStateException("Neither itemsFile nor userItemFile options are specified");
    } else {
      throw new IllegalStateException("itemsFile and userItemFile options cannot be specified simultaneously");
    }
  }

  /** Lazily loads the user-item filter; safe to call multiple times. */
  private void readUserItemFilterIfNeeded() throws IOException {
    if (!isUserItemFilterSpecified() && isUserItemFileSpecified()) {
      userItemFilter = readUserItemFilter(userItemFile);
    }
  }

  /**
   * Parses a userItemFile of "userId{tab-or-comma}itemId" lines into a per-user
   * item-set map; malformed lines are logged and skipped rather than failing the job.
   */
  private Map<Long, FastIDSet> readUserItemFilter(String pathString) throws IOException {
    Map<Long, FastIDSet> result = new HashMap<>();

    try (InputStream in = openFile(pathString)) {
      for (String line : new FileLineIterable(in)) {
        try {
          String[] tokens = SEPARATOR.split(line);
          Long userId = Long.parseLong(tokens[0]);
          Long itemId = Long.parseLong(tokens[1]);

          addUserAndItemIdToUserItemFilter(result, userId, itemId);
        } catch (NumberFormatException nfe) {
          log.warn("userItemFile line ignored: {}", line);
        }
      }
    }

    return result;
  }

  /** Adds one (user, item) pair to the filter, creating the user's item set on first use. */
  void addUserAndItemIdToUserItemFilter(Map<Long, FastIDSet> filter, Long userId, Long itemId) {
    FastIDSet itemIds;

    if (filter.containsKey(userId)) {
      itemIds = filter.get(userId);
    } else {
      itemIds = new FastIDSet();
      filter.put(userId, itemIds);
    }

    itemIds.add(itemId);
  }

  /** Collects the distinct user IDs (the keys) of a user-item filter into a {@link FastIDSet}. */
  static FastIDSet extractAllUserIdsFromUserItemFilter(Map<Long, FastIDSet> filter) {
    FastIDSet result = new FastIDSet();

    for (Long userId : filter.keySet()) {
      result.add(userId);
    }

    return result;
  }

  /** Collects the union of all item IDs (the values) of a user-item filter into a {@link FastIDSet}. */
  private FastIDSet extractAllItemIdsFromUserItemFilter(Map<Long, FastIDSet> filter) {
    FastIDSet result = new FastIDSet();

    for (FastIDSet itemIds : filter.values()) {
      result.addAll(itemIds);
    }

    return result;
  }

  /**
   * Reads a file of one numeric ID per line into a {@link FastIDSet}; unparseable
   * lines are logged and skipped. Returns null when pathString is null.
   */
  private FastIDSet readIDList(String pathString) throws IOException {
    FastIDSet result = null;

    if (pathString != null) {
      result = new FastIDSet();

      try (InputStream in = openFile(pathString)){
        for (String line : new FileLineIterable(in)) {
          try {
            result.add(Long.parseLong(line));
          } catch (NumberFormatException nfe) {
            log.warn("line ignored: {}", line);
          }
        }
      }
    }

    return result;
  }

  /** Opens a file on the job's filesystem (HDFS or local) for reading. */
  private InputStream openFile(String pathString) throws IOException {
    return HadoopUtil.openStream(new Path(pathString), conf);
  }

  public boolean isUsersFileSpecified () {
    return usersFile != null;
  }

  public boolean isItemsFileSpecified () {
    return itemsFile != null;
  }

  public boolean isUserItemFileSpecified () {
    return userItemFile != null;
  }

  public boolean isUserItemFilterSpecified() {
    return userItemFilter != null;
  }

  public FastIDSet getUserIds() {
    return userIds;
  }

  public FastIDSet getItemIds() {
    return itemIds;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
new file mode 100644
index 0000000..4415a55
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterAsVectorAndPrefsReducer.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+
+/**
+ * we use a neat little trick to explicitly filter items for some users: we inject a NaN summand into the preference
+ * estimation for those items, which makes {@link org.apache.mahout.cf.taste.hadoop.item.AggregateAndRecommendReducer}
+ * automatically exclude them
+ */
+public class ItemFilterAsVectorAndPrefsReducer
+ extends Reducer<VarLongWritable,VarLongWritable,VarIntWritable,VectorAndPrefsWritable> {
+
+ private final VarIntWritable itemIDIndexWritable = new VarIntWritable();
+ private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();
+
+ @Override
+ protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
+ throws IOException, InterruptedException {
+
+ int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get());
+ Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+ /* artificial NaN summand to exclude this item from the recommendations for all users specified in userIDs */
+ vector.set(itemIDIndex, Double.NaN);
+
+ List<Long> userIDs = new ArrayList<>();
+ List<Float> prefValues = new ArrayList<>();
+ for (VarLongWritable userID : values) {
+ userIDs.add(userID.get());
+ prefValues.add(1.0f);
+ }
+
+ itemIDIndexWritable.set(itemIDIndex);
+ vectorAndPrefs.set(vector, userIDs, prefValues);
+ ctx.write(itemIDIndexWritable, vectorAndPrefs);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
new file mode 100644
index 0000000..cdc1ddf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemFilterMapper.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarLongWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+/**
+ * map out all user/item pairs to filter, keyed by the itemID
+ */
+public class ItemFilterMapper extends Mapper<LongWritable,Text,VarLongWritable,VarLongWritable> {
+
+ private static final Pattern SEPARATOR = Pattern.compile("[\t,]");
+
+ private final VarLongWritable itemIDWritable = new VarLongWritable();
+ private final VarLongWritable userIDWritable = new VarLongWritable();
+
+ @Override
+ protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {
+ String[] tokens = SEPARATOR.split(line.toString());
+ long userID = Long.parseLong(tokens[0]);
+ long itemID = Long.parseLong(tokens[1]);
+ itemIDWritable.set(itemID);
+ userIDWritable.set(userID);
+ ctx.write(itemIDWritable, userIDWritable);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
new file mode 100644
index 0000000..ac8597e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexMapper.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+
+public final class ItemIDIndexMapper extends
+ Mapper<LongWritable,Text, VarIntWritable, VarLongWritable> {
+
+ private boolean transpose;
+
+ private final VarIntWritable indexWritable = new VarIntWritable();
+ private final VarLongWritable itemIDWritable = new VarLongWritable();
+
+ @Override
+ protected void setup(Context context) {
+ Configuration jobConf = context.getConfiguration();
+ transpose = jobConf.getBoolean(ToEntityPrefsMapper.TRANSPOSE_USER_ITEM, false);
+ }
+
+ @Override
+ protected void map(LongWritable key,
+ Text value,
+ Context context) throws IOException, InterruptedException {
+ String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
+ long itemID = Long.parseLong(tokens[transpose ? 0 : 1]);
+ int index = TasteHadoopUtils.idToIndex(itemID);
+ indexWritable.set(index);
+ itemIDWritable.set(itemID);
+ context.write(indexWritable, itemIDWritable);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
new file mode 100644
index 0000000..d9ecf5e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ItemIDIndexReducer.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+
+public final class ItemIDIndexReducer extends
+ Reducer<VarIntWritable, VarLongWritable, VarIntWritable,VarLongWritable> {
+
+ private final VarLongWritable minimumItemIDWritable = new VarLongWritable();
+
+ @Override
+ protected void reduce(VarIntWritable index,
+ Iterable<VarLongWritable> possibleItemIDs,
+ Context context) throws IOException, InterruptedException {
+ long minimumItemID = Long.MAX_VALUE;
+ for (VarLongWritable varLongWritable : possibleItemIDs) {
+ long itemID = varLongWritable.get();
+ if (itemID < minimumItemID) {
+ minimumItemID = itemID;
+ }
+ }
+ if (minimumItemID != Long.MAX_VALUE) {
+ minimumItemIDWritable.set(minimumItemID);
+ context.write(index, minimumItemIDWritable);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
new file mode 100644
index 0000000..0e818f3
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PartialMultiplyMapper.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+
+/**
+ * maps similar items and their preference values per user
+ */
+public final class PartialMultiplyMapper extends
+ Mapper<VarIntWritable,VectorAndPrefsWritable,VarLongWritable,PrefAndSimilarityColumnWritable> {
+
+ private final VarLongWritable userIDWritable = new VarLongWritable();
+ private final PrefAndSimilarityColumnWritable prefAndSimilarityColumn = new PrefAndSimilarityColumnWritable();
+
+ @Override
+ protected void map(VarIntWritable key,
+ VectorAndPrefsWritable vectorAndPrefsWritable,
+ Context context) throws IOException, InterruptedException {
+
+ Vector similarityMatrixColumn = vectorAndPrefsWritable.getVector();
+ List<Long> userIDs = vectorAndPrefsWritable.getUserIDs();
+ List<Float> prefValues = vectorAndPrefsWritable.getValues();
+
+ for (int i = 0; i < userIDs.size(); i++) {
+ long userID = userIDs.get(i);
+ float prefValue = prefValues.get(i);
+ if (!Float.isNaN(prefValue)) {
+ prefAndSimilarityColumn.set(prefValue, similarityMatrixColumn);
+ userIDWritable.set(userID);
+ context.write(userIDWritable, prefAndSimilarityColumn);
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
new file mode 100644
index 0000000..704c74a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/PrefAndSimilarityColumnWritable.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public final class PrefAndSimilarityColumnWritable implements Writable {
+
+ private float prefValue;
+ private Vector similarityColumn;
+
+ public PrefAndSimilarityColumnWritable() {
+ }
+
+ public PrefAndSimilarityColumnWritable(float prefValue, Vector similarityColumn) {
+ set(prefValue, similarityColumn);
+ }
+
+ public void set(float prefValue, Vector similarityColumn) {
+ this.prefValue = prefValue;
+ this.similarityColumn = similarityColumn;
+ }
+
+ public float getPrefValue() {
+ return prefValue;
+ }
+
+ public Vector getSimilarityColumn() {
+ return similarityColumn;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ prefValue = in.readFloat();
+ VectorWritable vw = new VectorWritable();
+ vw.readFields(in);
+ similarityColumn = vw.get();
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeFloat(prefValue);
+ VectorWritable vw = new VectorWritable(similarityColumn);
+ vw.setWritesLaxPrecision(true);
+ vw.write(out);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj instanceof PrefAndSimilarityColumnWritable) {
+ PrefAndSimilarityColumnWritable other = (PrefAndSimilarityColumnWritable) obj;
+ return prefValue == other.prefValue && similarityColumn.equals(other.similarityColumn);
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return RandomUtils.hashFloat(prefValue) + 31 * similarityColumn.hashCode();
+ }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
new file mode 100644
index 0000000..129db1d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * <p>Runs a completely distributed recommender job as a series of mapreduces.</p>
+ * <p/>
+ * <p>Preferences in the input file should look like {@code userID, itemID[, preferenceValue]}</p>
+ * <p/>
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
+ * simply expresses a preference for an item, but no degree of preference).
+ * </p>
+ * <p/>
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * parsed as {@code long}s.
+ * </p>
+ * <p/>
+ * <p>Command line arguments specific to this class are:</p>
+ * <p/>
+ * <ol>
+ * <li>--input(path): Directory containing one or more text files with the preference data</li>
+ * <li>--output(path): output path where recommender output should go</li>
+ * <li>--similarityClassname (classname): Name of vector similarity class to instantiate or a predefined similarity
+ * from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
+ * <li>--usersFile (path): only compute recommendations for user IDs contained in this file (optional)</li>
+ * <li>--itemsFile (path): only include item IDs from this file in the recommendations (optional)</li>
+ * <li>--filterFile (path): file containing comma-separated userID,itemID pairs. Used to exclude the item from the
+ * recommendations for that user (optional)</li>
+ * <li>--numRecommendations (integer): Number of recommendations to compute per user (10)</li>
+ * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
+ * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered per user in final
+ * recommendation phase (10)</li>
+ * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with less preferences than this in the similarity computation (1)</li>
+ * <li>--maxPrefsPerUserInItemSimilarity (integer): max number of preferences to consider per user in
+ * the item similarity computation phase,
+ * users with more preferences will be sampled down (1000)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below this</li>
+ * </ol>
+ * <p/>
+ * <p>General command line options are documented in {@link AbstractJob}.</p>
+ * <p/>
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
+ * arguments.</p>
+ */
public final class RecommenderJob extends AbstractJob {

  // Configuration key signalling that the input carries no explicit preference values.
  public static final String BOOLEAN_DATA = "booleanData";
  // Sub-directory (under the temp path) used by the preference-matrix preparation phase.
  public static final String DEFAULT_PREPARE_PATH = "preparePreferenceMatrix";

  private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
  private static final int DEFAULT_MAX_PREFS = 500;
  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  /**
   * Runs the distributed recommender as a sequence of (skippable) phases:
   * (1) prepare the preference matrix, (2) compute the item-item similarity
   * matrix via {@link RowSimilarityJob}, (3) partially multiply the similarity
   * matrix by the user vectors, and (4) aggregate the partial products into
   * per-user recommendations. Phase skipping is driven by
   * {@code shouldRunNextPhase} / {@code currentPhase}.
   *
   * @param args command-line options; see the class javadoc for the full list
   * @return 0 on success, -1 if argument parsing or any job fails
   */
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
        String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", null, "File of users to recommend for", null);
    addOption("itemsFile", null, "File of items to recommend for", null);
    addOption("filterFile", "f", "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
        + "the recommendations for that user (optional)", null);
    addOption("userItemFile", "uif", "File containing comma-separated userID,itemID pairs (optional). "
        + "Used to include only these items into recommendations. "
        + "Cannot be used together with usersFile or itemsFile", null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mxp",
        "Maximum number of preferences considered per user in final recommendation phase",
        String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation "
        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxPrefsInItemSimilarity", "mpiis", "max number of preferences to consider per user or item in the "
        + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
        + DEFAULT_MAX_PREFS + ')', String.valueOf(DEFAULT_MAX_PREFS));
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
        + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true);
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("outputPathForSimilarityMatrix", "opfsm", "write the item similarity matrix to this path (optional)",
        false);
    addOption("randomSeed", null, "use this seed for sampling", false);
    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      // Argument parsing failed (or --help was requested); abort.
      return -1;
    }

    // Resolve all options up front; sentinel constants are used when optional values are absent.
    Path outputPath = getOutputPath();
    int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
    String usersFile = getOption("usersFile");
    String itemsFile = getOption("itemsFile");
    String filterFile = getOption("filterFile");
    String userItemFile = getOption("userItemFile");
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    String similarityClassname = getOption("similarityClassname");
    double threshold = hasOption("threshold")
        ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed")
        ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    // Intermediate paths under the job's temp directory, shared between phases.
    Path prepPath = getTempPath(DEFAULT_PREPARE_PATH);
    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path explicitFilterPath = getTempPath("explicitFilterPath");
    Path partialMultiplyPath = getTempPath("partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    // -1 means "not yet known"; filled in by phase 1 or recounted in phase 2 if phase 1 is skipped.
    int numberOfUsers = -1;

    // Phase 1: convert raw (userID, itemID[, pref]) input into the preference matrix.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
          "--input", getInputPath().toString(),
          "--output", prepPath.toString(),
          "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
          "--booleanData", String.valueOf(booleanData),
          "--tempDir", getTempPath().toString(),
      });

      numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    // Phase 2: compute the item-item similarity matrix.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

      /* special behavior if phase 1 is skipped */
      if (numberOfUsers == -1) {
        numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
            PathType.LIST, null, getConf());
      }

      //calculate the co-occurrence matrix
      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[]{
          "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
          "--output", similarityMatrixPath.toString(),
          "--numberOfColumns", String.valueOf(numberOfUsers),
          "--similarityClassname", similarityClassname,
          "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity),
          "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity),
          "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
          "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
          "--threshold", String.valueOf(threshold),
          "--randomSeed", String.valueOf(randomSeed),
          "--tempDir", getTempPath().toString(),
      });

      // write out the similarity matrix if the user specified that behavior
      if (hasOption("outputPathForSimilarityMatrix")) {
        Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

        Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
            SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
            EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
            EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);

        Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
        mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
            new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
        outputSimilarityMatrix.waitForCompletion(true);
      }
    }

    //start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job partialMultiply = Job.getInstance(getConf(), "partialMultiply");
      Configuration partialMultiplyConf = partialMultiply.getConfiguration();

      // Two inputs feed the same shuffle: similarity-matrix rows and per-user preference vectors.
      MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
          SimilarityMatrixRowWrapperMapper.class);
      MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
          SequenceFileInputFormat.class, UserVectorSplitterMapper.class);
      partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
      partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
      partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
      partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
      partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
      partialMultiply.setOutputKeyClass(VarIntWritable.class);
      partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
      partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
      partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

      if (usersFile != null) {
        partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
      }

      if (userItemFile != null) {
        partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile);
      }

      partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

      boolean succeeded = partialMultiply.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Phase 4: aggregate partial products into per-user recommendations.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      //filter out any users we don't care about
      /* convert the user/item pairs to filter if a filterfile has been specified */
      if (filterFile != null) {
        Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
            ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
            ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
            SequenceFileOutputFormat.class);
        boolean succeeded = itemFiltering.waitForCompletion(true);
        if (!succeeded) {
          return -1;
        }
      }

      // The filter output (if any) is appended as a second, comma-separated input path.
      String aggregateAndRecommendInput = partialMultiplyPath.toString();
      if (filterFile != null) {
        aggregateAndRecommendInput += "," + explicitFilterPath;
      }

      Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
          ? SequenceFileOutputFormat.class : TextOutputFormat.class;

      //extract out the recommendations
      Job aggregateAndRecommend = prepareJob(
          new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class,
          PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class,
          AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class,
          outputFormat);
      Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
      if (itemsFile != null) {
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
      }

      if (userItemFile != null) {
        aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile);
      }

      if (filterFile != null) {
        setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath);
      }
      setIOSort(aggregateAndRecommend);
      aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
          new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
      aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
      aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
      boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }

  /**
   * Tunes shuffle/sort settings for the aggregation job: raises the merge
   * factor, sizes {@code io.sort.mb} from the configured child-JVM heap
   * (-Xmx, capped at 1024MB), and lengthens the task timeout because the
   * Merger can go a long time without reporting status.
   *
   * @param job job whose configuration is mutated in place
   */
  private static void setIOSort(JobContext job) {
    Configuration conf = job.getConfiguration();
    conf.setInt("io.sort.factor", 100);
    String javaOpts = conf.get("mapred.map.child.java.opts"); // new arg name
    if (javaOpts == null) {
      javaOpts = conf.get("mapred.child.java.opts"); // old arg name
    }
    // Fall back to 512MB when no -Xmx setting can be parsed from the child JVM options.
    int assumedHeapSize = 512;
    if (javaOpts != null) {
      Matcher m = Pattern.compile("-Xmx([0-9]+)([mMgG])").matcher(javaOpts);
      if (m.find()) {
        assumedHeapSize = Integer.parseInt(m.group(1));
        String megabyteOrGigabyte = m.group(2);
        if ("g".equalsIgnoreCase(megabyteOrGigabyte)) {
          assumedHeapSize *= 1024;
        }
      }
    }
    // Cap this at 1024MB now; see https://issues.apache.org/jira/browse/MAPREDUCE-2308
    conf.setInt("io.sort.mb", Math.min(assumedHeapSize / 2, 1024));
    // For some reason the Merger doesn't report status for a long time; increase
    // timeout when running these jobs
    conf.setInt("mapred.task.timeout", 60 * 60 * 1000);
  }

  /** Command-line entry point; delegates to {@link #run(String[])} via ToolRunner. */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Configuration(), new RecommenderJob(), args);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
new file mode 100644
index 0000000..8ae8215
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/SimilarityMatrixRowWrapperMapper.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * maps a row of the similarity matrix to a {@link VectorOrPrefWritable}
+ *
+ * actually a column from that matrix has to be used but as the similarity matrix is symmetric,
+ * we can use a row instead of having to transpose it
+ */
+public final class SimilarityMatrixRowWrapperMapper extends
+ Mapper<IntWritable,VectorWritable,VarIntWritable,VectorOrPrefWritable> {
+
+ private final VarIntWritable index = new VarIntWritable();
+ private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();
+
+ @Override
+ protected void map(IntWritable key,
+ VectorWritable value,
+ Context context) throws IOException, InterruptedException {
+ Vector similarityMatrixRow = value.get();
+ /* remove self similarity */
+ similarityMatrixRow.set(key.get(), Double.NaN);
+
+ index.set(key.get());
+ vectorOrPref.set(similarityMatrixRow);
+
+ context.write(index, vectorOrPref);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
new file mode 100644
index 0000000..e6e47fd
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorsReducer.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+/**
+ * <h1>Input</h1>
+ *
+ * <p>
+ * Takes user IDs as {@link VarLongWritable} mapped to all associated item IDs and preference values, as
+ * {@link EntityPrefWritable}s.
+ * </p>
+ *
+ * <h1>Output</h1>
+ *
+ * <p>
+ * The same user ID mapped to a {@link RandomAccessSparseVector} representation of the same item IDs and
+ * preference values. Item IDs are used as vector indexes; they are hashed into ints to work as indexes with
+ * {@link TasteHadoopUtils#idToIndex(long)}. The mapping is remembered for later with a combination of
+ * {@link ItemIDIndexMapper} and {@link ItemIDIndexReducer}.
+ * </p>
+ */
+public final class ToUserVectorsReducer extends
+ Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
+
+ public static final String MIN_PREFERENCES_PER_USER = ToUserVectorsReducer.class.getName()
+ + ".minPreferencesPerUser";
+
+ private int minPreferences;
+
+ public enum Counters { USERS }
+
+ private final VectorWritable userVectorWritable = new VectorWritable();
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ super.setup(ctx);
+ minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 1);
+ }
+
+ @Override
+ protected void reduce(VarLongWritable userID,
+ Iterable<VarLongWritable> itemPrefs,
+ Context context) throws IOException, InterruptedException {
+ Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
+ for (VarLongWritable itemPref : itemPrefs) {
+ int index = TasteHadoopUtils.idToIndex(itemPref.get());
+ float value = itemPref instanceof EntityPrefWritable ? ((EntityPrefWritable) itemPref).getPrefValue() : 1.0f;
+ userVector.set(index, value);
+ }
+
+ if (userVector.getNumNondefaultElements() >= minPreferences) {
+ userVectorWritable.set(userVector);
+ userVectorWritable.setWritesLaxPrecision(true);
+ context.getCounter(Counters.USERS).increment(1);
+ context.write(userID, userVectorWritable);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
new file mode 100644
index 0000000..9167437
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToVectorAndPrefReducer.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.Vector;
+
+public final class ToVectorAndPrefReducer extends
+ Reducer<VarIntWritable,VectorOrPrefWritable,VarIntWritable,VectorAndPrefsWritable> {
+
+ private final VectorAndPrefsWritable vectorAndPrefs = new VectorAndPrefsWritable();
+
+ @Override
+ protected void reduce(VarIntWritable key,
+ Iterable<VectorOrPrefWritable> values,
+ Context context) throws IOException, InterruptedException {
+
+ List<Long> userIDs = new ArrayList<>();
+ List<Float> prefValues = new ArrayList<>();
+ Vector similarityMatrixColumn = null;
+ for (VectorOrPrefWritable value : values) {
+ if (value.getVector() == null) {
+ // Then this is a user-pref value
+ userIDs.add(value.getUserID());
+ prefValues.add(value.getValue());
+ } else {
+ // Then this is the column vector
+ if (similarityMatrixColumn != null) {
+ throw new IllegalStateException("Found two similarity-matrix columns for item index " + key.get());
+ }
+ similarityMatrixColumn = value.getVector();
+ }
+ }
+
+ if (similarityMatrixColumn == null) {
+ return;
+ }
+
+ vectorAndPrefs.set(similarityMatrixColumn, userIDs, prefValues);
+ context.write(key, vectorAndPrefs);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
new file mode 100644
index 0000000..2290d06
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/UserVectorSplitterMapper.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class UserVectorSplitterMapper extends
+ Mapper<VarLongWritable,VectorWritable, VarIntWritable,VectorOrPrefWritable> {
+
+ private static final Logger log = LoggerFactory.getLogger(UserVectorSplitterMapper.class);
+
+ static final String USERS_FILE = "usersFile";
+ static final String MAX_PREFS_PER_USER_CONSIDERED = "maxPrefsPerUserConsidered";
+ static final int DEFAULT_MAX_PREFS_PER_USER_CONSIDERED = 10;
+
+ private int maxPrefsPerUserConsidered;
+ private FastIDSet usersToRecommendFor;
+
+ private final VarIntWritable itemIndexWritable = new VarIntWritable();
+ private final VectorOrPrefWritable vectorOrPref = new VectorOrPrefWritable();
+
+ @Override
+ protected void setup(Context context) throws IOException {
+ Configuration jobConf = context.getConfiguration();
+ maxPrefsPerUserConsidered = jobConf.getInt(MAX_PREFS_PER_USER_CONSIDERED, DEFAULT_MAX_PREFS_PER_USER_CONSIDERED);
+
+ IDReader idReader = new IDReader (jobConf);
+ idReader.readIDs();
+ usersToRecommendFor = idReader.getUserIds();
+ }
+
+ @Override
+ protected void map(VarLongWritable key,
+ VectorWritable value,
+ Context context) throws IOException, InterruptedException {
+ long userID = key.get();
+
+ log.info("UserID = {}", userID);
+
+ if (usersToRecommendFor != null && !usersToRecommendFor.contains(userID)) {
+ return;
+ }
+ Vector userVector = maybePruneUserVector(value.get());
+
+ for (Element e : userVector.nonZeroes()) {
+ itemIndexWritable.set(e.index());
+ vectorOrPref.set(userID, (float) e.get());
+ context.write(itemIndexWritable, vectorOrPref);
+ }
+ }
+
+ private Vector maybePruneUserVector(Vector userVector) {
+ if (userVector.getNumNondefaultElements() <= maxPrefsPerUserConsidered) {
+ return userVector;
+ }
+
+ float smallestLargeValue = findSmallestLargeValue(userVector);
+
+ // "Blank out" small-sized prefs to reduce the amount of partial products
+ // generated later. They're not zeroed, but NaN-ed, so they come through
+ // and can be used to exclude these items from prefs.
+ for (Element e : userVector.nonZeroes()) {
+ float absValue = Math.abs((float) e.get());
+ if (absValue < smallestLargeValue) {
+ e.set(Float.NaN);
+ }
+ }
+
+ return userVector;
+ }
+
+ private float findSmallestLargeValue(Vector userVector) {
+
+ PriorityQueue<Float> topPrefValues = new PriorityQueue<Float>(maxPrefsPerUserConsidered) {
+ @Override
+ protected boolean lessThan(Float f1, Float f2) {
+ return f1 < f2;
+ }
+ };
+
+ for (Element e : userVector.nonZeroes()) {
+ float absValue = Math.abs((float) e.get());
+ topPrefValues.insertWithOverflow(absValue);
+ }
+ return topPrefValues.top();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
new file mode 100644
index 0000000..11d496f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorAndPrefsWritable.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public final class VectorAndPrefsWritable implements Writable {
+
+ private Vector vector;
+ private List<Long> userIDs;
+ private List<Float> values;
+
+ public VectorAndPrefsWritable() {
+ }
+
+ public VectorAndPrefsWritable(Vector vector, List<Long> userIDs, List<Float> values) {
+ set(vector, userIDs, values);
+ }
+
+ public void set(Vector vector, List<Long> userIDs, List<Float> values) {
+ this.vector = vector;
+ this.userIDs = userIDs;
+ this.values = values;
+ }
+
+ public Vector getVector() {
+ return vector;
+ }
+
+ public List<Long> getUserIDs() {
+ return userIDs;
+ }
+
+ public List<Float> getValues() {
+ return values;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ VectorWritable vw = new VectorWritable(vector);
+ vw.setWritesLaxPrecision(true);
+ vw.write(out);
+ Varint.writeUnsignedVarInt(userIDs.size(), out);
+ for (int i = 0; i < userIDs.size(); i++) {
+ Varint.writeSignedVarLong(userIDs.get(i), out);
+ out.writeFloat(values.get(i));
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ VectorWritable writable = new VectorWritable();
+ writable.readFields(in);
+ vector = writable.get();
+ int size = Varint.readUnsignedVarInt(in);
+ userIDs = new ArrayList<>(size);
+ values = new ArrayList<>(size);
+ for (int i = 0; i < size; i++) {
+ userIDs.add(Varint.readSignedVarLong(in));
+ values.add(in.readFloat());
+ }
+ }
+
+ @Override
+ public String toString() {
+ return vector + "\t" + userIDs + '\t' + values;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
new file mode 100644
index 0000000..515d7ea
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/item/VectorOrPrefWritable.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.math.Varint;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+public final class VectorOrPrefWritable implements Writable {
+
+ private Vector vector;
+ private long userID;
+ private float value;
+
+ public VectorOrPrefWritable() {
+ }
+
+ public VectorOrPrefWritable(Vector vector) {
+ this.vector = vector;
+ }
+
+ public VectorOrPrefWritable(long userID, float value) {
+ this.userID = userID;
+ this.value = value;
+ }
+
+ public Vector getVector() {
+ return vector;
+ }
+
+ public long getUserID() {
+ return userID;
+ }
+
+ public float getValue() {
+ return value;
+ }
+
+ void set(Vector vector) {
+ this.vector = vector;
+ this.userID = Long.MIN_VALUE;
+ this.value = Float.NaN;
+ }
+
+ public void set(long userID, float value) {
+ this.vector = null;
+ this.userID = userID;
+ this.value = value;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ if (vector == null) {
+ out.writeBoolean(false);
+ Varint.writeSignedVarLong(userID, out);
+ out.writeFloat(value);
+ } else {
+ out.writeBoolean(true);
+ VectorWritable vw = new VectorWritable(vector);
+ vw.setWritesLaxPrecision(true);
+ vw.write(out);
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ boolean hasVector = in.readBoolean();
+ if (hasVector) {
+ VectorWritable writable = new VectorWritable();
+ writable.readFields(in);
+ set(writable.get());
+ } else {
+ long theUserID = Varint.readSignedVarLong(in);
+ float theValue = in.readFloat();
+ set(theUserID, theValue);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return vector == null ? userID + ":" + value : vector.toString();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
new file mode 100644
index 0000000..c64ee38
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
+import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
+import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
+import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexMapper;
+import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer;
+import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
+import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorsReducer;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.List;
+import java.util.Map;
+
+public class PreparePreferenceMatrixJob extends AbstractJob {
+
+ public static final String NUM_USERS = "numUsers.bin";
+ public static final String ITEMID_INDEX = "itemIDIndex";
+ public static final String USER_VECTORS = "userVectors";
+ public static final String RATING_MATRIX = "ratingMatrix";
+
+ private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new PreparePreferenceMatrixJob(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
+ + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
+ addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
+ addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
+
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+ boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+ float ratingShift = Float.parseFloat(getOption("ratingShift"));
+ //convert items to an internal index
+ Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
+ ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
+ VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
+ itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
+ boolean succeeded = itemIDIndex.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ //convert user preferences into a vector per user
+ Job toUserVectors = prepareJob(getInputPath(),
+ getOutputPath(USER_VECTORS),
+ TextInputFormat.class,
+ ToItemPrefsMapper.class,
+ VarLongWritable.class,
+ booleanData ? VarLongWritable.class : EntityPrefWritable.class,
+ ToUserVectorsReducer.class,
+ VarLongWritable.class,
+ VectorWritable.class,
+ SequenceFileOutputFormat.class);
+ toUserVectors.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
+ toUserVectors.getConfiguration().setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
+ toUserVectors.getConfiguration().set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));
+ succeeded = toUserVectors.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ //we need the number of users later
+ int numberOfUsers = (int) toUserVectors.getCounters().findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
+ HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());
+ //build the rating matrix
+ Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
+ ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class, ToItemVectorsReducer.class,
+ IntWritable.class, VectorWritable.class);
+ toItemVectors.setCombinerClass(ToItemVectorsReducer.class);
+
+ succeeded = toItemVectors.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ return 0;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
new file mode 100644
index 0000000..5a4144c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsMapper.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
+public class ToItemVectorsMapper
+ extends Mapper<VarLongWritable,VectorWritable,IntWritable,VectorWritable> {
+
+ private final IntWritable itemID = new IntWritable();
+ private final VectorWritable itemVectorWritable = new VectorWritable();
+
+ @Override
+ protected void map(VarLongWritable rowIndex, VectorWritable vectorWritable, Context ctx)
+ throws IOException, InterruptedException {
+ Vector userRatings = vectorWritable.get();
+
+ int column = TasteHadoopUtils.idToIndex(rowIndex.get());
+
+ itemVectorWritable.setWritesLaxPrecision(true);
+
+ Vector itemVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+ for (Vector.Element elem : userRatings.nonZeroes()) {
+ itemID.set(elem.index());
+ itemVector.setQuick(column, elem.get());
+ itemVectorWritable.set(itemVector);
+ ctx.write(itemID, itemVectorWritable);
+ // reset vector for reuse
+ itemVector.setQuick(elem.index(), 0.0);
+ }
+ }
+
+}
r***@apache.org
2018-06-28 14:54:55 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo.svg b/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
new file mode 100644
index 0000000..374c89d
--- /dev/null
+++ b/community/mahout-mr/mr/src/images/logos/mahout-logo.svg
@@ -0,0 +1,627 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+ width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+ c-0.377-0.467-0.747-0.935-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+ c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+ c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.07-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+ c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+ c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+ c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+ c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.676c-0.919,2.061-1.383,4.79-1.383,8.113V376
+ c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.627
+ c-0.351-0.763-0.845-1.844-4.675-1.844c-5.691,0-6.793,1.673-7.148,2.329c-0.298,0.616-1.122,2.832-1.122,8.451V376
+ c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+ c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+ c0.185,0.133,0.368,0.272,0.545,0.419c1.322,1.091,2.566,2.261,3.73,3.505c2.438-2.188,5.07-4.048,7.884-5.57
+ c0.07-0.037,0.14-0.074,0.211-0.111c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+ c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.312c0.939-1.602,1.974-3.131,3.1-4.586
+ C462.511,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+ c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+ c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.628,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+ c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+ c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+ c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.393,0,8.763-2.237,10.312-5.607
+ c0.86-2.016,1.867-5.809,1.867-12.502v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+ c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+ v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+ c0.475-0.067,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+ c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+ c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+ c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+ c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+ C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+ c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+ c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+ L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+ c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+ c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.298,716.139,299.801,709.751,299.801
+ L709.751,299.801z M491.611,300.711c-0.264,0.336-0.564,0.824-0.854,1.53l7.135-0.876c3.8-0.479,5.996-0.97,7.181-1.303
+ c-1.357-0.336-3.556-0.663-6.974-0.663C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+ <path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+ c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+ c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+ c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+ v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+ c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.082,0.037-2.135,0.056-3.16,0.056c-11.43,0-19.356-2.298-23.779-6.896
+ c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+ c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+ s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+ C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+ c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+ c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+ c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+ c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+ c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+ c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+ l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+ c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+ c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+ c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+ c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+ C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+ c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+ c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+ c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+ V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+ c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+ v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+ c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+ c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+ c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+ C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+ c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+ c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+ c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+ c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+ c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+ c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+ c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+ c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+ c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+ c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.948,5.119,18.378,7.624,31.614,7.624
+ c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.151-0.006,0.228-0.009l13.18-0.601c10.681-0.487,19.09-9.288,19.09-19.979V356
+ c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+ c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+ c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+ c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+ c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+ c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+ M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+ c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+ c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.499,388c-14.734,0-16.194-10.602-16.491-15.158
+ c-2.282,0.969-5.548,2.491-8.354,3.799C254.849,383.077,243.715,388,236.501,388c-25.962,0-44.167-21.608-49.721-41.42
+ c-0.496,1.273-1.104,2.537-1.848,3.777l-0.259,0.435l-0.316,0.395c-8.148,10.178-36.573,10.815-36.855,10.815
+ c-13.224,0-22.923-3.371-28.833-10.016c-3.175-3.571-6.704-9.897-5.67-19.862c-0.078-13.16,4.078-39.976,7.317-50.777l1.603-5.348
+ h5.582h11h3.107l2.196,2.198c2.883,2.884,2.607,6.303,2.405,8.801c-0.188,2.295-0.534,6.566-0.213,15.226
+ c0.097,2.288,2.599,9.209,5.632,13.571c2.909-2.997,8.484-10.194,18.782-27.42c1.031-1.728,1.504-2.515,1.852-3.035l4.313-6.47
+ c-2.459-5.739-5.026-12.353-5.562-21.952L171,256.709V256.5c0-1.622,0.274-3.164,0.536-4.655c0.063-0.361,0.141-0.804,0.208-1.224
+ c-1.643-1.129-3.839-2.151-6.13-3.219c-2.105-0.981-4.286-1.998-6.391-3.253c-0.369-0.209-0.732-0.424-1.084-0.646
+ c0.54,1.213,0.863,2.522,0.863,3.995c0,3.938-4.782,14.329-8.794,22.355l-1.475,2.951l-3.172,0.907
+ c-4.74,1.354-14.825,1.835-22.685,1.835c-3.458,0-7.982-0.087-12.876-0.411v1.362c0,1.262,0.243,3.584,0.437,5.449
+ c0.245,2.333,0.395,3.824,0.395,5.052c0,9.625-4.9,16.854-13.795,20.354c-5.909,2.326-12.401,2.647-18.535,2.647
+ c-14.37,0-22.193-2.224-27.005-7.674c-4.932-5.586-4.944-12.661-4.959-20.85c-0.002-1.473-0.004-3.027-0.036-4.666
+ c-0.019-0.987,0.051-4.084,0.19-9.929c0.137-5.841,0.308-13.11,0.308-16.382v-21.006c-4.691-11.945-6.906-23.596-7.927-30.968
+ c-1.042-7.547,0.479-14.028,4.519-19.263c2.712-3.514,6.315-6.115,10.41-8.083V167.5c0-4.225,0-8.547,0.348-12.964
+ c-0.274-0.088-0.551-0.179-0.829-0.27c-7.124-2.318-15.989-5.206-21.714-11.884c-9.206-10.842-14.806-37.737-14.806-40.882
+ c0-9.415,5.693-15.5,14.502-15.5c9.336,0,14.5,8.575,14.5,14.5c0,2.35-0.814,5.752-2.542,12.427
+ c-0.538,2.071-1.259,4.855-1.454,5.917c0.127,5.01,3.023,8.396,5.461,10.37c3.111,2.514,7.279,4.155,11.751,4.676
+ c17.654-45.552,69.792-61.89,110.282-61.89c50.339,0,81.613,26.563,86.226,73.025c15.616-5.543,33.031-11.026,46.774-11.026
+ c10.264,0,22.501,4.947,22.501,28.502c0,26.979-14.823,65.564-47.938,90.951l-5.499,4.217l-4.639-5.151
+ c-6.05-6.721-13.757-10.396-24.254-11.563l-1.745-0.194c0.874,3.85,2.272,7.381,3.797,11.229c1.422,3.59,2.945,7.434,4.069,11.783
+ l0.006-0.038l10.701,14.268c6.913,9.214,14.502,33.55,14.502,46.5c0,0.402-0.011,0.822-0.036,1.257
+ c3.445-4.229,8.915-6.759,15.534-6.759c13.399,0,19.501,8.554,19.501,16.5c0,3.264-1.628,6.606-4.312,11.725
+ c-0.299,0.573-0.668,1.275-1.004,1.937c0.4,0.484,0.85,1.01,1.234,1.457c3.217,3.753,8.081,9.421,8.081,16.884
+ C313,379.379,304.799,388,293.499,388L293.499,388z M246.438,356.085c-0.279,0.348-0.393,0.734-0.435,1.228
+ C246.151,356.929,246.297,356.518,246.438,356.085L246.438,356.085z M270.053,335.944c-1.209,1.354-2.772,2.58-4.778,3.571
+ c1.533-0.104,3.139-0.207,4.788-0.296c-0.04-0.548-0.065-1.122-0.065-1.719C269.998,336.974,270.017,336.455,270.053,335.944
+ L270.053,335.944z M219.022,317.98c0.091,0.007,0.192,0.013,0.299,0.017c0.586-0.088,1.987-0.419,2.938-0.646
+ c0.477-0.113,0.958-0.226,1.438-0.337c-1.721,0.031-3.757,0.146-4.62,0.546C219.061,317.656,219.037,317.793,219.022,317.98
+ L219.022,317.98z M172.535,125.259c8.01,5.611,15.055,13.589,20.567,20.67c2.555-14.029,4.93-23.667,8.843-29.008
+ c-5.7,1.628-9.896,5.062-12.694,7.354c-2.441,2-4.55,3.727-7.75,3.727c-2.044,0-3.801-0.7-6.71-1.858
+ C174.113,125.873,173.356,125.571,172.535,125.259L172.535,125.259z"/>
+ <path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+ l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+ c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.537,119.277
+ c18.366,0.001,22.214,25.926,26.963,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+ c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+ c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+ c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+ c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+ c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+ c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+ c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+ c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+ C211.555,119.349,212.566,119.277,213.537,119.277 M54.5,250.5c10.601,13.491,30.487,26.055,46.237,26.055
+ c0.593,0,1.182-0.019,1.763-0.055c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+ C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+ C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+ c0.394,0.309,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+ c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+ c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.538c1.934-7.468,2.781-11.078,2.781-14.303
+ c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.04,7.645,20.521,12.238,25.929
+ l0.022,0.026l0.021,0.025c5.737,6.693,13.633,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+ c-3.329,2.099-6.335,4.7-8.847,7.953c-3.655,4.735-7.666,12.894-6.012,24.87c1.152,8.331,3.418,19.827,7.859,31.553V250.5
+ c0,3.185-0.17,10.406-0.308,16.209c-0.158,6.708-0.211,9.153-0.189,10.261c0.029,1.536,0.031,3.052,0.034,4.518
+ c0.016,8.896,0.031,18.095,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+ c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.937-1.766,21.561c-1.177,12.446,3.429,20.561,7.567,25.214
+ c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.544,12.2-3.518,16.228-6.052
+ c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.324-12.061
+ c0.02-0.01,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+ c7.6,0,14.536-2.912,19.53-8.2c4.817-5.101,7.47-12.132,7.47-19.8c0-8.514-4.28-14.937-7.848-19.338
+ c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+ c-2.485-13.582-8.349-30.865-14.745-39.394l-9.87-13.159c-0.968-3.414-2.118-6.49-3.218-9.3c3.468,1.514,6.374,3.645,8.938,6.493
+ l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+ c-5.04-9.488-14.14-14.713-25.622-14.713c-12.294,0-26.813,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+ c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+ l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.874c0.6,8.092,2.501,14.302,4.513,19.442
+ l-2.098,3.147c-0.447,0.67-0.922,1.462-2.049,3.348c-4.393,7.349-7.832,12.72-10.507,16.643c-0.255-7.689,0.052-11.492,0.22-13.565
+ C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+ c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+ c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+ c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+ c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+ c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+ c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+ c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+ c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+ c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+ <path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+ c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+ c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+ c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+ c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+ c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+ c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+ c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+ c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+ c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+ c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+ c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+ c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+ c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+ C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+ <defs>
+		<filter id="My_OpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+ <g filter="url(#My_OpacityMaskFilter)">
+
+ <image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_1_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+ c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+ c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+ c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+ c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+ c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+ l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+ c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+ l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+ c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+ c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+ c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+ c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+ c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+ c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+ c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+ c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+ c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+ c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+ c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+ l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+ c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+ c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+ c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+ c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+ c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+ c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+ c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+ c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+ c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+ <g>
+ <path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+ c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+ c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+ c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+ c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+ c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+ c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+ c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+ C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+ c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+ c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+ c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+ c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+ c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+ c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+ c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+ c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+ c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+ c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+ c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+ c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+ c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+ c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+ c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+ c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+ c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+ c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+ c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+ l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+ c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+ c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+ c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+ c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+ c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+ c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+ c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+ c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+ c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+ c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+ L213.538,116.277z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+ <g filter="url(#My_OpacityMaskFilter_1_)">
+
+ <image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_2_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+ l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+ c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+ c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+ c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+ C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+ <path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+ c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+ c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643
+ C204.677,106.643,204.677,106.643,204.677,106.643c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327
+ c-2.745,0-5.4-0.355-7.892-1.057c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478
+ l1.881,2.086c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+ c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+ c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+ L204.677,106.643z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+ <g filter="url(#My_OpacityMaskFilter_2_)">
+
+ <image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+ </image>
+ </g>
+ </mask>
+ <g opacity="0.6" mask="url(#SVGID_3_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+ c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+ C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+ c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+ c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+ c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+ c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+ <g filter="url(#My_OpacityMaskFilter_3_)">
+
+ <image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_4_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+ <g filter="url(#My_OpacityMaskFilter_4_)">
+
+ <image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_5_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+ c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+ c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+ C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+ <g>
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+ <defs>
+ <filter id="My_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+ <g filter="url(#My_OpacityMaskFilter_5_)">
+
+ <image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_6_)">
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+ C190.67,38,141.58,37.17,149,22.33z"/>
+</svg>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/assembly/job.xml b/community/mahout-mr/mr/src/main/assembly/job.xml
new file mode 100644
index 0000000..2bdb3ce
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/assembly/job.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly
+ xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
+ http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+ <id>job</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <unpack>true</unpack>
+ <unpackOptions>
+ <!-- MAHOUT-1126 -->
+ <excludes>
+ <exclude>META-INF/LICENSE</exclude>
+ </excludes>
+ </unpackOptions>
+ <scope>runtime</scope>
+ <outputDirectory>/</outputDirectory>
+ <useTransitiveFiltering>true</useTransitiveFiltering>
+ <excludes>
+ <exclude>org.apache.hadoop:hadoop-core</exclude>
+ </excludes>
+ </dependencySet>
+ </dependencySets>
+ <fileSets>
+ <fileSet>
+ <directory>${basedir}/target/classes</directory>
+ <outputDirectory>/</outputDirectory>
+ <excludes>
+ <exclude>*.jar</exclude>
+ </excludes>
+ </fileSet>
+ <fileSet>
+ <directory>${basedir}/target/classes</directory>
+ <outputDirectory>/</outputDirectory>
+ <includes>
+ <include>driver.classes.default.props</include>
+ </includes>
+ </fileSet>
+ </fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/assembly/src.xml b/community/mahout-mr/mr/src/main/assembly/src.xml
new file mode 100644
index 0000000..0bb8e8b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/assembly/src.xml
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+ <id>src</id>
+ <formats>
+ <format>dir</format>
+ <format>tar.gz</format>
+ </formats>
+ <fileSets>
+ <fileSet>
+ <directory>${project.basedir}/..</directory>
+ <outputDirectory/>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <includes>
+ <include>**/README*</include>
+ <include>**/LICENSE*</include>
+ <include>**/NOTICE*</include>
+ <include>**/pom.xml</include>
+ <include>**/src/**</include>
+ <include>src/conf/**</include>
+ <include>**/build.xml</include>
+ <include>**/*.properties</include>
+ </includes>
+ <excludes>
+ <exclude>**/target/**</exclude>
+ </excludes>
+ </fileSet>
+ <fileSet>
+ <directory>${project.basedir}/../bin</directory>
+ <outputDirectory>bin</outputDirectory>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <fileMode>0755</fileMode>
+ <directoryMode>0755</directoryMode>
+ </fileSet>
+ <fileSet>
+ <directory>${project.basedir}/../examples/bin</directory>
+ <outputDirectory>examples/bin</outputDirectory>
+ <useDefaultExcludes>true</useDefaultExcludes>
+ <fileMode>0755</fileMode>
+ <directoryMode>0755</directoryMode>
+ <excludes>
+ <exclude>work</exclude>
+ <exclude>work/**</exclude>
+ </excludes>
+ </fileSet>
+ </fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
new file mode 100644
index 0000000..5f3c879
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/Version.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Resources;
+
+import java.io.IOException;
+
+/**
+ * Reports the Mahout build version. Utility class; not instantiable.
+ */
+public final class Version {
+
+ private Version() {
+ }
+
+ /**
+ * @return the {@code Implementation-Version} attribute from this class's jar
+ *   manifest, or {@code null} when not running from a packaged jar
+ *   (e.g. when run from an IDE or exploded classes directory)
+ */
+ public static String version() {
+ return Version.class.getPackage().getImplementationVersion();
+ }
+
+ /**
+ * @return the contents of the classpath resource named {@code version},
+ *   decoded as UTF-8
+ * @throws IOException if the resource cannot be read
+ */
+ public static String versionFromResource() throws IOException {
+ return Resources.toString(Resources.getResource("version"), Charsets.UTF_8);
+ }
+
+ // Prints both version sources separated by a space; either may be
+ // "null" / missing depending on how the build was packaged.
+ public static void main(String[] args) throws IOException {
+ System.out.println(version() + ' ' + versionFromResource());
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
new file mode 100644
index 0000000..1ac5b72
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * Thrown to indicate that an item ID was not known to the underlying data model.
+ */
+public final class NoSuchItemException extends TasteException {
+
+ public NoSuchItemException() { }
+
+ /**
+ * @param itemID the unknown item ID; its decimal string form becomes the
+ *   exception message
+ */
+ public NoSuchItemException(long itemID) {
+ this(String.valueOf(itemID));
+ }
+
+ /**
+ * @param message detail message describing the missing item
+ */
+ public NoSuchItemException(String message) {
+ super(message);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
new file mode 100644
index 0000000..cbb60fa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * Thrown when a user ID is requested that is not known.
+ * </p>
+ */
+public final class NoSuchUserException extends TasteException {
+
+  public NoSuchUserException() { }
+
+  /**
+   * @param userID ID of the unknown user; its decimal form becomes the exception message
+   */
+  public NoSuchUserException(long userID) {
+    super(String.valueOf(userID));
+  }
+
+  public NoSuchUserException(String message) {
+    super(message);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
new file mode 100644
index 0000000..9b26bee
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+import java.util.Collection;
+
+/**
+ * <p>
+ * Implementations of this interface have state that can be periodically refreshed. For example, an
+ * implementation instance might contain some pre-computed information that should be periodically refreshed.
+ * The {@link #refresh(Collection)} method triggers such a refresh.
+ * </p>
+ *
+ * <p>
+ * All Taste components implement this. In particular,
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}s do. Callers may want to call
+ * {@link #refresh(Collection)} periodically to re-compute information throughout the system and bring it up
+ * to date, though this operation may be expensive.
+ * </p>
+ */
+public interface Refreshable {
+
+  /**
+   * <p>
+   * Triggers "refresh" -- whatever that means -- of the implementation. The general contract is that any
+   * {@link Refreshable} should always leave itself in a consistent, operational state, and that the refresh
+   * atomically updates internal state from old to new.
+   * </p>
+   *
+   * @param alreadyRefreshed
+   *          {@link org.apache.mahout.cf.taste.common.Refreshable}s that are known to have already been
+   *          refreshed as a result of an initial call to a {@link #refresh(Collection)} method on some
+   *          object. This ensures that objects in a refresh dependency graph aren't refreshed twice
+   *          needlessly.
+   */
+  void refresh(Collection<Refreshable> alreadyRefreshed);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
new file mode 100644
index 0000000..1792eff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * An exception thrown when an error occurs inside the Taste engine. This is the root of the Taste
+ * exception hierarchy; more specific conditions are signaled by subclasses such as
+ * {@link NoSuchItemException} and {@link NoSuchUserException}.
+ * </p>
+ */
+public class TasteException extends Exception {
+
+  public TasteException() { }
+
+  public TasteException(String message) {
+    super(message);
+  }
+
+  public TasteException(Throwable cause) {
+    super(cause);
+  }
+
+  public TasteException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
new file mode 100644
index 0000000..4e39617
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.common;
+
+/**
+ * <p>
+ * A simple enum which gives symbolic names to the ideas of "weighted" and "unweighted", to make various API
+ * calls which take a weighting parameter more readable.
+ * </p>
+ */
+public enum Weighting {
+
+  /** The caller requests weighted treatment. */
+  WEIGHTED,
+  /** The caller requests unweighted treatment. */
+  UNWEIGHTED
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
new file mode 100644
index 0000000..875c65e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * <p>
+ * Implementations of this interface are simple helper classes which create a {@link DataModel} to be
+ * used while evaluating a {@link org.apache.mahout.cf.taste.recommender.Recommender}.
+ * </p>
+ *
+ * @see RecommenderBuilder
+ * @see RecommenderEvaluator
+ */
+public interface DataModelBuilder {
+
+  /**
+   * <p>
+   * Builds a {@link DataModel} implementation to be used in an evaluation, given training data.
+   * </p>
+   *
+   * @param trainingData
+   *          data to be used in the {@link DataModel}
+   * @return {@link DataModel} based upon the given data
+   */
+  DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
new file mode 100644
index 0000000..9c442ff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+/**
+ * <p>
+ * Implementations encapsulate information retrieval-related statistics about a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+ * </p>
+ *
+ * <p>
+ * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>.
+ * </p>
+ */
+public interface IRStatistics {
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Precision">Precision</a>.
+   * </p>
+   *
+   * @return the precision of the evaluated recommendations
+   */
+  double getPrecision();
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Recall">Recall</a>.
+   * </p>
+   *
+   * @return the recall of the evaluated recommendations
+   */
+  double getRecall();
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Fall-Out">Fall-Out</a>.
+   * </p>
+   *
+   * @return the fall-out of the evaluated recommendations
+   */
+  double getFallOut();
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>.
+   * </p>
+   *
+   * @return the F1 measure of the evaluated recommendations
+   */
+  double getF1Measure();
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>.
+   * </p>
+   *
+   * @param n
+   *          the measure's parameter, as in "F-n measure"; see the link above for its exact role
+   * @return the F-n measure of the evaluated recommendations
+   */
+  double getFNMeasure(double n);
+
+  /**
+   * <p>
+   * See <a href="http://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG">
+   * Normalized Discounted Cumulative Gain</a>.
+   * </p>
+   *
+   * @return the normalized discounted cumulative gain of the evaluated recommendations
+   */
+  double getNormalizedDiscountedCumulativeGain();
+
+  /**
+   * @return the fraction of all users for whom recommendations could be produced
+   */
+  double getReach();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
new file mode 100644
index 0000000..1805092
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * <p>
+ * Implementations of this interface are simple helper classes which create a {@link Recommender} to be
+ * evaluated based on the given {@link DataModel}.
+ * </p>
+ */
+public interface RecommenderBuilder {
+
+  /**
+   * <p>
+   * Builds a {@link Recommender} implementation to be evaluated, using the given {@link DataModel}.
+   * </p>
+   *
+   * @param dataModel
+   *          {@link DataModel} to build the {@link Recommender} on
+   * @return {@link Recommender} based upon the given {@link DataModel}
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  Recommender buildRecommender(DataModel dataModel) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
new file mode 100644
index 0000000..dcbbcf8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * <p>
+ * Implementations of this interface evaluate the quality of a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+ * </p>
+ */
+public interface RecommenderEvaluator {
+
+  /**
+   * <p>
+   * Evaluates the quality of a {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations.
+   * The range of values that may be returned depends on the implementation, but <em>lower</em> values must
+   * mean better recommendations, with 0 being the lowest / best possible evaluation, meaning a perfect match.
+   * This method does not accept a {@link org.apache.mahout.cf.taste.recommender.Recommender} directly, but
+   * rather a {@link RecommenderBuilder} which can build the
+   * {@link org.apache.mahout.cf.taste.recommender.Recommender} to test on top of a given {@link DataModel}.
+   * </p>
+   *
+   * <p>
+   * Implementations will take a certain percentage of the preferences supplied by the given {@link DataModel}
+   * as "training data". This is typically most of the data, like 90%. This data is used to produce
+   * recommendations, and the rest of the data is compared against estimated preference values to see how much
+   * the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s predicted preferences match the user's
+   * real preferences. Specifically, for each user, this percentage of the user's ratings are used to produce
+   * recommendations, and for each user, the remaining preferences are compared against the user's real
+   * preferences.
+   * </p>
+   *
+   * <p>
+   * For large datasets, it may be desirable to only evaluate based on a small percentage of the data.
+   * {@code evaluationPercentage} controls how many of the {@link DataModel}'s users are used in
+   * evaluation.
+   * </p>
+   *
+   * <p>
+   * To be clear, {@code trainingPercentage} and {@code evaluationPercentage} are not related. They
+   * do not need to add up to 1.0, for example.
+   * </p>
+   *
+   * @param recommenderBuilder
+   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
+   * @param dataModelBuilder
+   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel}
+   *          implementation will be used
+   * @param dataModel
+   *          dataset to test on
+   * @param trainingPercentage
+   *          percentage of each user's preferences to use to produce recommendations; the rest are compared
+   *          to estimated preference values to evaluate
+   *          {@link org.apache.mahout.cf.taste.recommender.Recommender} performance
+   * @param evaluationPercentage
+   *          percentage of users to use in evaluation
+   * @return a "score" representing how well the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s
+   *         estimated preferences match real values; <em>lower</em> scores mean a better match and 0 is a
+   *         perfect match
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  double evaluate(RecommenderBuilder recommenderBuilder,
+                  DataModelBuilder dataModelBuilder,
+                  DataModel dataModel,
+                  double trainingPercentage,
+                  double evaluationPercentage) throws TasteException;
+
+  /**
+   * @deprecated see {@link DataModel#getMaxPreference()}
+   */
+  @Deprecated
+  float getMaxPreference();
+
+  /**
+   * @deprecated see {@link DataModel#getMaxPreference()}
+   */
+  @Deprecated
+  void setMaxPreference(float maxPreference);
+
+  /**
+   * @deprecated see {@link DataModel#getMinPreference()}
+   */
+  @Deprecated
+  float getMinPreference();
+
+  /**
+   * @deprecated see {@link DataModel#getMinPreference()}
+   */
+  @Deprecated
+  void setMinPreference(float minPreference);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
new file mode 100644
index 0000000..6e4e9c7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+
+/**
+ * <p>
+ * Implementations collect information retrieval-related statistics on a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s performance, including precision, recall and
+ * f-measure.
+ * </p>
+ *
+ * <p>
+ * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>.
+ * </p>
+ */
+public interface RecommenderIRStatsEvaluator {
+
+  /**
+   * @param recommenderBuilder
+   *          object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test
+   * @param dataModelBuilder
+   *          {@link DataModelBuilder} to use, or if null, a default {@link DataModel} implementation will be
+   *          used
+   * @param dataModel
+   *          dataset to test on
+   * @param rescorer
+   *          if any, to use when computing recommendations
+   * @param at
+   *          as in, "precision at 5". The number of recommendations to consider when evaluating precision,
+   *          etc.
+   * @param relevanceThreshold
+   *          items whose preference value is at least this value are considered "relevant" for the purposes
+   *          of computations
+   * @param evaluationPercentage
+   *          percentage of users to use in evaluation
+   * @return {@link IRStatistics} with resulting precision, recall, etc.
+   * @throws TasteException
+   *           if an error occurs while accessing the {@link DataModel}
+   */
+  IRStatistics evaluate(RecommenderBuilder recommenderBuilder,
+                        DataModelBuilder dataModelBuilder,
+                        DataModel dataModel,
+                        IDRescorer rescorer,
+                        int at,
+                        double relevanceThreshold,
+                        double evaluationPercentage) throws TasteException;
+
+}
r***@apache.org
2018-06-28 14:54:59 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv b/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
new file mode 100644
index 0000000..d7a2ede
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/bank-full.csv
@@ -0,0 +1,45212 @@
+"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
+58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
+44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
+33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
+35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
+28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
+58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
+45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
+57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
+54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
+58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
+36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
+44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
+32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
+24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
+38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
+40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
+46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
+41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
+46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
+57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
+39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
+27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
+59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
+29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
+56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
+57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
+43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
+31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
+55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
+55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
+32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
+28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
+53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
+34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
+57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
+43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
+26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
+39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
+48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
+52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
+54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
+54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
+50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
+44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
+35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
+51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
+31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
+35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
+36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
+40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
+51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
+50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
+61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
+35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
+39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
+42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
+59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
+40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
+47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
+53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
+46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
+53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
+57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
+49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
+42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
+22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
+51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
+50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
+59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
+39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
+42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
+40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
+56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
+37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
+39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
+38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
+54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
+58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
+40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
+56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
+42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
+51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
+36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
+54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
+37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
+33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
+46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
+51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
+40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
+48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
+32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
+55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
+40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
+58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
+45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
+51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
+43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
+44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
+46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
+59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
+44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
+33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
+46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
+43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
+23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
+25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
+40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
+58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
+32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
+58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
+37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
+27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
+42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
+29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
+58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
+46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
+34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
+49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
+32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
+43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
+58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
+24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
+51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
+50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
+40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
+33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
+36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
+57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
+36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
+44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
+39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
+40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
+54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
+50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
+37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
+46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
+32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
+48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
+41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
+44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
+38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
+48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
+42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
+34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
+56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
+39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
+46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
+38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
+56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
+37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
+37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
+48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
+30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
+48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
+31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
+37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
+49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
+43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
+32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
+55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
+31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
+35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
+34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
+32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
+33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
+52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
+55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
+38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
+31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
+28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
+45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
+35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
+60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
+49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
+38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
+40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
+36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
+44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
+40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
+30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
+57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
+24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
+33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
+43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
+43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
+35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
+56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
+40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
+44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
+28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
+47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
+56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
+31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
+30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
+38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
+55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
+59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
+33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
+30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
+42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
+55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
+51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
+32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
+29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
+46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
+56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
+29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
+47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
+56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
+45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
+31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
+37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
+30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
+58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
+36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
+40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
+42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
+35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
+44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
+31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
+36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
+47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
+37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
+26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
+52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
+55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
+32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
+37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
+45;"blue-collar";"m

<TRUNCATED>
r***@apache.org
2018-06-28 14:54:58 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt b/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
new file mode 100644
index 0000000..d87c031
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/cf-data-purchase.txt
@@ -0,0 +1,7 @@
+u1,iphone
+u1,ipad
+u2,nexus
+u2,galaxy
+u3,surface
+u4,iphone
+u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt b/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
new file mode 100644
index 0000000..09ad9b6
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/cf-data-view.txt
@@ -0,0 +1,12 @@
+u1,ipad
+u1,nexus
+u1,galaxy
+u2,iphone
+u2,ipad
+u2,nexus
+u2,galaxy
+u3,surface
+u3,nexus
+u4,iphone
+u4,ipad
+u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv b/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
new file mode 100644
index 0000000..46ea564
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/donut-test.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","xx","xy","yy","c","a","b"
+0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
+0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
+0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
+0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
+0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
+0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
+0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
+0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
+0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
+0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
+0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
+0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
+0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
+0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
+0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
+0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
+0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
+0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
+0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
+0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
+0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
+0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
+0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
+0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
+0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
+0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
+0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
+0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
+0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
+0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
+0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
+0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
+0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
+0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
+0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
+0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
+0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
+0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
+0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
+0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/donut.csv b/community/mahout-mr/mr-examples/src/main/resources/donut.csv
new file mode 100644
index 0000000..33ba3b7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/donut.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
+0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
+0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
+0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
+0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
+0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
+0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
+0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
+0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
+0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
+0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
+0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
+0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
+0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
+0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
+0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
+0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
+0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
+0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
+0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
+0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
+0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
+0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
+0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
+0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
+0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
+0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
+0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
+0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
+0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
+0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
+0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
+0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
+0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
+0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
+0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
+0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
+0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
+0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
+0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
+0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/resources/test-data.csv b/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
new file mode 100644
index 0000000..ab683cd
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/resources/test-data.csv
@@ -0,0 +1,61 @@
+"V1","V2","V3","V4","V5","V6","V7","V8","y"
+1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
+1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
+1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
+1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
+1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
+1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
+1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
+1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
+1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
+1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
+1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
+1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
+1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
+1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
+1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
+1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
+1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
+1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
+1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
+1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
+1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
+1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
+1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
+1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
+1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
+1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
+1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
+1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
+1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
+1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
+1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
+1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
+1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
+1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
+1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
+1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
+1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
+1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
+1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
+1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
+1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
+1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
+1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
+1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
+1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
+1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
+1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
+1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
+1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
+1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
+1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
+1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
+1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
+1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
+1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
+1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
+1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
+1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
+1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
+1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
new file mode 100644
index 0000000..e849011
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+public class LogisticModelParametersTest extends MahoutTestCase {
+
+ @Test
+ public void serializationWithoutCsv() throws IOException {
+ LogisticModelParameters params = new LogisticModelParameters();
+ params.setTargetVariable("foo");
+ params.setTypeMap(Collections.<String, String>emptyMap());
+ params.setTargetCategories(Arrays.asList("foo", "bar"));
+ params.setNumFeatures(1);
+ params.createRegression();
+
+ //MAHOUT-1196 should work without "csv" being set
+ params.saveTo(new ByteArrayOutputStream());
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
new file mode 100644
index 0000000..c8e4879
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.mahout.examples.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.junit.Test;
+
+public class ModelDissectorTest extends MahoutTestCase {
+ @Test
+ public void testCategoryOrdering() {
+ ModelDissector.Weight w = new ModelDissector.Weight("a", new DenseVector(new double[]{-2, -5, 5, 2, 4, 1, 0}), 4);
+ assertEquals(1, w.getCategory(0), 0);
+ assertEquals(-5, w.getWeight(0), 0);
+
+ assertEquals(2, w.getCategory(1), 0);
+ assertEquals(5, w.getWeight(1), 0);
+
+ assertEquals(4, w.getCategory(2), 0);
+ assertEquals(4, w.getWeight(2), 0);
+
+ assertEquals(0, w.getCategory(3), 0);
+ assertEquals(-2, w.getWeight(3), 0);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
new file mode 100644
index 0000000..4cde692
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Sets;
+import com.google.common.io.Resources;
+import org.apache.mahout.classifier.AbstractVectorClassifier;
+import org.apache.mahout.examples.MahoutTestCase;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+public class TrainLogisticTest extends MahoutTestCase {
+
+ @Test
+ public void example131() throws Exception {
+ String outputFile = getTestTempFile("model").getAbsolutePath();
+
+ StringWriter sw = new StringWriter();
+ PrintWriter pw = new PrintWriter(sw, true);
+ TrainLogistic.mainToOutput(new String[]{
+ "--input", "donut.csv",
+ "--output", outputFile,
+ "--target", "color", "--categories", "2",
+ "--predictors", "x", "y",
+ "--types", "numeric",
+ "--features", "20",
+ "--passes", "100",
+ "--rate", "50"
+ }, pw);
+ String trainOut = sw.toString();
+ assertTrue(trainOut.contains("x -0.7"));
+ assertTrue(trainOut.contains("y -0.4"));
+
+ LogisticModelParameters lmp = TrainLogistic.getParameters();
+ assertEquals(1.0e-4, lmp.getLambda(), 1.0e-9);
+ assertEquals(20, lmp.getNumFeatures());
+ assertTrue(lmp.useBias());
+ assertEquals("color", lmp.getTargetVariable());
+ CsvRecordFactory csv = lmp.getCsvRecordFactory();
+ assertEquals("[1, 2]", new TreeSet<>(csv.getTargetCategories()).toString());
+ assertEquals("[Intercept Term, x, y]", Sets.newTreeSet(csv.getPredictors()).toString());
+
+ // verify model by building dissector
+ AbstractVectorClassifier model = TrainLogistic.getModel();
+ List<String> data = Resources.readLines(Resources.getResource("donut.csv"), Charsets.UTF_8);
+ Map<String, Double> expectedValues = ImmutableMap.of("x", -0.7, "y", -0.43, "Intercept Term", -0.15);
+ verifyModel(lmp, csv, data, model, expectedValues);
+
+ // test saved model
+ try (InputStream in = new FileInputStream(new File(outputFile))){
+ LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(in);
+ CsvRecordFactory csvOut = lmpOut.getCsvRecordFactory();
+ csvOut.firstLine(data.get(0));
+ OnlineLogisticRegression lrOut = lmpOut.createRegression();
+ verifyModel(lmpOut, csvOut, data, lrOut, expectedValues);
+ }
+
+ sw = new StringWriter();
+ pw = new PrintWriter(sw, true);
+ RunLogistic.mainToOutput(new String[]{
+ "--input", "donut.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ }, pw);
+ trainOut = sw.toString();
+ assertTrue(trainOut.contains("AUC = 0.57"));
+ assertTrue(trainOut.contains("confusion: [[27.0, 13.0], [0.0, 0.0]]"));
+ }
+
+ @Test
+ public void example132() throws Exception {
+ String outputFile = getTestTempFile("model").getAbsolutePath();
+
+ StringWriter sw = new StringWriter();
+ PrintWriter pw = new PrintWriter(sw, true);
+ TrainLogistic.mainToOutput(new String[]{
+ "--input", "donut.csv",
+ "--output", outputFile,
+ "--target", "color",
+ "--categories", "2",
+ "--predictors", "x", "y", "a", "b", "c",
+ "--types", "numeric",
+ "--features", "20",
+ "--passes", "100",
+ "--rate", "50"
+ }, pw);
+
+ String trainOut = sw.toString();
+ assertTrue(trainOut.contains("a 0."));
+ assertTrue(trainOut.contains("b -1."));
+ assertTrue(trainOut.contains("c -25."));
+
+ sw = new StringWriter();
+ pw = new PrintWriter(sw, true);
+ RunLogistic.mainToOutput(new String[]{
+ "--input", "donut.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ }, pw);
+ trainOut = sw.toString();
+ assertTrue(trainOut.contains("AUC = 1.00"));
+
+ sw = new StringWriter();
+ pw = new PrintWriter(sw, true);
+ RunLogistic.mainToOutput(new String[]{
+ "--input", "donut-test.csv",
+ "--model", outputFile,
+ "--auc",
+ "--confusion"
+ }, pw);
+ trainOut = sw.toString();
+ assertTrue(trainOut.contains("AUC = 0.9"));
+ }
+
+ private static void verifyModel(LogisticModelParameters lmp,
+ RecordFactory csv,
+ List<String> data,
+ AbstractVectorClassifier model,
+ Map<String, Double> expectedValues) {
+ ModelDissector md = new ModelDissector();
+ for (String line : data.subList(1, data.size())) {
+ Vector v = new DenseVector(lmp.getNumFeatures());
+ csv.getTraceDictionary().clear();
+ csv.processLine(line, v);
+ md.update(v, csv.getTraceDictionary(), model);
+ }
+
+ // check right variables are present
+ List<ModelDissector.Weight> weights = md.summary(10);
+ Set<String> expected = Sets.newHashSet(expectedValues.keySet());
+ for (ModelDissector.Weight weight : weights) {
+ assertTrue(expected.remove(weight.getFeature()));
+ assertEquals(expectedValues.get(weight.getFeature()), weight.getWeight(), 0.1);
+ }
+ assertEquals(0, expected.size());
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
new file mode 100644
index 0000000..6e43b97
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class ClustersFilterTest extends MahoutTestCase {
+
+ private Configuration configuration;
+ private Path output;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ configuration = getConfiguration();
+ output = getTestTempDirPath();
+ }
+
+ @Test
+ public void testAcceptNotFinal() throws Exception {
+ Path path0 = new Path(output, "clusters-0");
+ Path path1 = new Path(output, "clusters-1");
+
+ path0.getFileSystem(configuration).createNewFile(path0);
+ path1.getFileSystem(configuration).createNewFile(path1);
+
+ PathFilter clustersFilter = new ClustersFilter();
+
+ assertTrue(clustersFilter.accept(path0));
+ assertTrue(clustersFilter.accept(path1));
+ }
+
+ @Test
+ public void testAcceptFinalPath() throws IOException {
+ Path path0 = new Path(output, "clusters-0");
+ Path path1 = new Path(output, "clusters-1");
+ Path path2 = new Path(output, "clusters-2");
+ Path path3Final = new Path(output, "clusters-3-final");
+
+ path0.getFileSystem(configuration).createNewFile(path0);
+ path1.getFileSystem(configuration).createNewFile(path1);
+ path2.getFileSystem(configuration).createNewFile(path2);
+ path3Final.getFileSystem(configuration).createNewFile(path3Final);
+
+ PathFilter clustersFilter = new ClustersFilter();
+
+ assertTrue(clustersFilter.accept(path0));
+ assertTrue(clustersFilter.accept(path1));
+ assertTrue(clustersFilter.accept(path2));
+ assertTrue(clustersFilter.accept(path3Final));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
new file mode 100644
index 0000000..4d81e3f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.examples;
+
+/**
+ * This class should not exist. It's here to work around some bizarre problem in Maven
+ * dependency management wherein it can see methods in {@link org.apache.mahout.common.MahoutTestCase}
+ * but not constants. Duplicated here to make it jive.
+ */
+public abstract class MahoutTestCase extends org.apache.mahout.common.MahoutTestCase {
+
+ /** "Close enough" value for floating-point comparisons. */
+ public static final double EPSILON = 0.000001;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country.txt b/community/mahout-mr/mr-examples/src/test/resources/country.txt
new file mode 100644
index 0000000..6a22091
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country.txt
@@ -0,0 +1,229 @@
+Afghanistan
+Albania
+Algeria
+American Samoa
+Andorra
+Angola
+Anguilla
+Antigua and Barbuda
+Argentina
+Armenia
+Aruba
+Australia
+Austria
+Azerbaijan
+Bahamas
+Bangladesh
+Barbados
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bolivia
+Bosnia and Herzegovina
+Botswana
+Bouvet Island
+Brazil
+British Indian Ocean Territory
+Brunei Darussalam
+Bulgaria
+Burkina Faso
+Burundi
+Cambodia
+Cameroon
+Canada
+Cape Verde
+Cayman Islands
+Central African Republic
+Chad
+Chile
+China
+Christmas Island
+Cocos Islands
+Colombia
+Comoros
+Congo
+Cook Islands
+Costa Rica
+Croatia
+Côte d'Ivoire
+Cuba
+Cyprus
+Czech Republic
+Djibouti
+Dominica
+Dominican Republic
+Ecuador
+Egypt
+El Salvador
+Equatorial Guinea
+Eritrea
+Estonia
+Ethiopia
+Falkland Islands
+Faroe Islands
+Fiji
+Finland
+France
+French Guiana
+French Polynesia
+French Southern Territories
+Gabon
+Georgia
+Germany
+Ghana
+Gibraltar
+Greece
+Greenland
+Grenada
+Guadeloupe
+Guam
+Guatemala
+Guernsey
+Guinea
+Guinea-Bissau
+Guyana
+Haiti
+Honduras
+Hong Kong
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Isle of Man
+Israel
+Italy
+Japan
+Jersey
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kuwait
+Kyrgyzstan
+Latvia
+Lebanon
+Lesotho
+Liberia
+Liechtenstein
+Lithuania
+Luxembourg
+Macedonia
+Madagascar
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Marshall Islands
+Martinique
+Mauritania
+Mauritius
+Mayotte
+Mexico
+Micronesia
+Moldova
+Monaco
+Mongolia
+Montenegro
+Montserrat
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nauru
+Nepal
+Netherlands
+Netherlands Antilles
+New Caledonia
+New Zealand
+Nicaragua
+Niger
+Nigeria
+Niue
+Norfolk Island
+Northern Mariana Islands
+Norway
+Oman
+Pakistan
+Palau
+Palestinian Territory
+Panama
+Papua New Guinea
+Paraguay
+Peru
+Philippines
+Pitcairn
+Poland
+Portugal
+Puerto Rico
+Qatar
+Réunion
+Russian Federation
+Rwanda
+Saint Barthélemy
+Saint Helena
+Saint Kitts and Nevis
+Saint Lucia
+Saint Martin
+Saint Pierre and Miquelon
+Saint Vincent and the Grenadines
+Samoa
+San Marino
+Sao Tome and Principe
+Saudi Arabia
+Senegal
+Serbia
+Seychelles
+Sierra Leone
+Singapore
+Slovakia
+Slovenia
+Solomon Islands
+Somalia
+South Africa
+South Georgia and the South Sandwich Islands
+Spain
+Sri Lanka
+Sudan
+Suriname
+Svalbard and Jan Mayen
+Swaziland
+Sweden
+Switzerland
+Syrian Arab Republic
+Taiwan
+Tanzania
+Thailand
+Timor-Leste
+Togo
+Tokelau
+Tonga
+Trinidad and Tobago
+Tunisia
+Turkey
+Turkmenistan
+Turks and Caicos Islands
+Tuvalu
+Ukraine
+United Arab Emirates
+United Kingdom
+United States
+United States Minor Outlying Islands
+Uruguay
+Uzbekistan
+Vanuatu
+Vatican
+Venezuela
+Vietnam
+Virgin Islands
+Wallis and Futuna
+Yemen
+Zambia
+Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country10.txt b/community/mahout-mr/mr-examples/src/test/resources/country10.txt
new file mode 100644
index 0000000..97a63e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country10.txt
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/country2.txt b/community/mahout-mr/mr-examples/src/test/resources/country2.txt
new file mode 100644
index 0000000..f4b4f61
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/country2.txt
@@ -0,0 +1,2 @@
+United States
+United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/subjects.txt b/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
new file mode 100644
index 0000000..f52ae33
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/subjects.txt
@@ -0,0 +1,2 @@
+Science
+History

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos b/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
new file mode 100644
index 0000000..94a63d6
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/wdbc.infos
@@ -0,0 +1,32 @@
+IGNORED
+LABEL, B, M
+NUMERICAL, 6.9, 28.2
+NUMERICAL, 9.7, 39.3
+NUMERICAL, 43.7, 188.5
+NUMERICAL, 143.5, 2501.0
+NUMERICAL, 0.0, 0.2
+NUMERICAL, 0.0, 0.4
+NUMERICAL, 0.0, 0.5
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.1, 0.4
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.1, 2.9
+NUMERICAL, 0.3, 4.9
+NUMERICAL, 0.7, 22.0
+NUMERICAL, 6.8, 542.3
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.2
+NUMERICAL, 0.0, 0.4
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 0.0, 0.1
+NUMERICAL, 7.9, 36.1
+NUMERICAL, 12.0, 49.6
+NUMERICAL, 50.4, 251.2
+NUMERICAL, 185.2, 4254.0
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.0, 1.1
+NUMERICAL, 0.0, 1.3
+NUMERICAL, 0.0, 0.3
+NUMERICAL, 0.1, 0.7
+NUMERICAL, 0.0, 0.3
r***@apache.org
2018-06-28 14:54:50 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
new file mode 100644
index 0000000..e267a39
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/OrderBasedRecommenderEvaluator.java
@@ -0,0 +1,431 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Evaluate recommender by comparing order of all raw prefs with order in
+ * recommender's output for that user. Can also compare data models.
+ */
+public final class OrderBasedRecommenderEvaluator {
+
+ private static final Logger log = LoggerFactory.getLogger(OrderBasedRecommenderEvaluator.class);
+
+ private OrderBasedRecommenderEvaluator() {
+ }
+
+ public static void evaluate(Recommender recommender1,
+ Recommender recommender2,
+ int samples,
+ RunningAverage tracker,
+ String tag) throws TasteException {
+ printHeader();
+ LongPrimitiveIterator users = recommender1.getDataModel().getUserIDs();
+
+ while (users.hasNext()) {
+ long userID = users.nextLong();
+ List<RecommendedItem> recs1 = recommender1.recommend(userID, samples);
+ List<RecommendedItem> recs2 = recommender2.recommend(userID, samples);
+ FastIDSet commonSet = new FastIDSet();
+ long maxItemID = setBits(commonSet, recs1, samples);
+ FastIDSet otherSet = new FastIDSet();
+ maxItemID = Math.max(maxItemID, setBits(otherSet, recs2, samples));
+ int max = mask(commonSet, otherSet, maxItemID);
+ max = Math.min(max, samples);
+ if (max < 2) {
+ continue;
+ }
+ Long[] items1 = getCommonItems(commonSet, recs1, max);
+ Long[] items2 = getCommonItems(commonSet, recs2, max);
+ double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
+ tracker.addDatum(variance);
+ }
+ }
+
+ public static void evaluate(Recommender recommender,
+ DataModel model,
+ int samples,
+ RunningAverage tracker,
+ String tag) throws TasteException {
+ printHeader();
+ LongPrimitiveIterator users = recommender.getDataModel().getUserIDs();
+ while (users.hasNext()) {
+ long userID = users.nextLong();
+ List<RecommendedItem> recs1 = recommender.recommend(userID, model.getNumItems());
+ PreferenceArray prefs2 = model.getPreferencesFromUser(userID);
+ prefs2.sortByValueReversed();
+ FastIDSet commonSet = new FastIDSet();
+ long maxItemID = setBits(commonSet, recs1, samples);
+ FastIDSet otherSet = new FastIDSet();
+ maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples));
+ int max = mask(commonSet, otherSet, maxItemID);
+ max = Math.min(max, samples);
+ if (max < 2) {
+ continue;
+ }
+ Long[] items1 = getCommonItems(commonSet, recs1, max);
+ Long[] items2 = getCommonItems(commonSet, prefs2, max);
+ double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
+ tracker.addDatum(variance);
+ }
+ }
+
+ public static void evaluate(DataModel model1,
+ DataModel model2,
+ int samples,
+ RunningAverage tracker,
+ String tag) throws TasteException {
+ printHeader();
+ LongPrimitiveIterator users = model1.getUserIDs();
+ while (users.hasNext()) {
+ long userID = users.nextLong();
+ PreferenceArray prefs1 = model1.getPreferencesFromUser(userID);
+ PreferenceArray prefs2 = model2.getPreferencesFromUser(userID);
+ prefs1.sortByValueReversed();
+ prefs2.sortByValueReversed();
+ FastIDSet commonSet = new FastIDSet();
+ long maxItemID = setBits(commonSet, prefs1, samples);
+ FastIDSet otherSet = new FastIDSet();
+ maxItemID = Math.max(maxItemID, setBits(otherSet, prefs2, samples));
+ int max = mask(commonSet, otherSet, maxItemID);
+ max = Math.min(max, samples);
+ if (max < 2) {
+ continue;
+ }
+ Long[] items1 = getCommonItems(commonSet, prefs1, max);
+ Long[] items2 = getCommonItems(commonSet, prefs2, max);
+ double variance = scoreCommonSubset(tag, userID, samples, max, items1, items2);
+ tracker.addDatum(variance);
+ }
+ }
+
+ /**
+ * This exists because FastIDSet has 'retainAll' as MASK, but there is
+ * no count of the number of items in the set. size() is supposed to do
+ * this but does not work.
+ */
+ private static int mask(FastIDSet commonSet, FastIDSet otherSet, long maxItemID) {
+ int count = 0;
+ for (int i = 0; i <= maxItemID; i++) {
+ if (commonSet.contains(i)) {
+ if (otherSet.contains(i)) {
+ count++;
+ } else {
+ commonSet.remove(i);
+ }
+ }
+ }
+ return count;
+ }
+
+ private static Long[] getCommonItems(FastIDSet commonSet, Iterable<RecommendedItem> recs, int max) {
+ Long[] commonItems = new Long[max];
+ int index = 0;
+ for (RecommendedItem rec : recs) {
+ Long item = rec.getItemID();
+ if (commonSet.contains(item)) {
+ commonItems[index++] = item;
+ }
+ if (index == max) {
+ break;
+ }
+ }
+ return commonItems;
+ }
+
+ private static Long[] getCommonItems(FastIDSet commonSet, PreferenceArray prefs1, int max) {
+ Long[] commonItems = new Long[max];
+ int index = 0;
+ for (int i = 0; i < prefs1.length(); i++) {
+ Long item = prefs1.getItemID(i);
+ if (commonSet.contains(item)) {
+ commonItems[index++] = item;
+ }
+ if (index == max) {
+ break;
+ }
+ }
+ return commonItems;
+ }
+
+ private static long setBits(FastIDSet modelSet, List<RecommendedItem> items, int max) {
+ long maxItem = -1;
+ for (int i = 0; i < items.size() && i < max; i++) {
+ long itemID = items.get(i).getItemID();
+ modelSet.add(itemID);
+ if (itemID > maxItem) {
+ maxItem = itemID;
+ }
+ }
+ return maxItem;
+ }
+
+ private static long setBits(FastIDSet modelSet, PreferenceArray prefs, int max) {
+ long maxItem = -1;
+ for (int i = 0; i < prefs.length() && i < max; i++) {
+ long itemID = prefs.getItemID(i);
+ modelSet.add(itemID);
+ if (itemID > maxItem) {
+ maxItem = itemID;
+ }
+ }
+ return maxItem;
+ }
+
+ private static void printHeader() {
+ log.info("tag,user,samples,common,hamming,bubble,rank,normal,score");
+ }
+
+ /**
+ * Common Subset Scoring
+ *
+ * These measurements are given the set of results that are common to both
+ * recommendation lists. They only get ordered lists.
+ *
+ * These measures all return raw numbers that do not correlate among the tests.
+ * The numbers are not corrected against the total number of samples or the
+ * number of common items.
+ * The one contract is that all measures are 0 for an exact match and an
+ * increasing positive number as differences increase.
+ */
+ private static double scoreCommonSubset(String tag,
+ long userID,
+ int samples,
+ int subset,
+ Long[] itemsL,
+ Long[] itemsR) {
+ int[] vectorZ = new int[subset];
+ int[] vectorZabs = new int[subset];
+
+ long bubble = sort(itemsL, itemsR);
+ int hamming = slidingWindowHamming(itemsR, itemsL);
+ if (hamming > samples) {
+ throw new IllegalStateException();
+ }
+ getVectorZ(itemsR, itemsL, vectorZ, vectorZabs);
+ double normalW = normalWilcoxon(vectorZ, vectorZabs);
+ double meanRank = getMeanRank(vectorZabs);
+ // case statement for requested value
+ double variance = Math.sqrt(meanRank);
+ log.info("{},{},{},{},{},{},{},{},{}",
+ tag, userID, samples, subset, hamming, bubble, meanRank, normalW, variance);
+ return variance;
+ }
+
+ // simple sliding-window hamming distance: a[i or plus/minus 1] == b[i]
+ private static int slidingWindowHamming(Long[] itemsR, Long[] itemsL) {
+ int count = 0;
+ int samples = itemsR.length;
+
+ if (itemsR[0].equals(itemsL[0]) || itemsR[0].equals(itemsL[1])) {
+ count++;
+ }
+ for (int i = 1; i < samples - 1; i++) {
+ long itemID = itemsL[i];
+ if (itemsR[i] == itemID || itemsR[i - 1] == itemID || itemsR[i + 1] == itemID) {
+ count++;
+ }
+ }
+ if (itemsR[samples - 1].equals(itemsL[samples - 1]) || itemsR[samples - 1].equals(itemsL[samples - 2])) {
+ count++;
+ }
+ return count;
+ }
+
+ /**
+ * Normal-distribution probability value for matched sets of values.
+ * Based upon:
+ * http://comp9.psych.cornell.edu/Darlington/normscor.htm
+ *
+ * The Standard Wilcoxon is not used because it requires a lookup table.
+ */
+ static double normalWilcoxon(int[] vectorZ, int[] vectorZabs) {
+ int nitems = vectorZ.length;
+
+ double[] ranks = new double[nitems];
+ double[] ranksAbs = new double[nitems];
+ wilcoxonRanks(vectorZ, vectorZabs, ranks, ranksAbs);
+ return Math.min(getMeanWplus(ranks), getMeanWminus(ranks));
+ }
+
+ /**
+ * vector Z is a list of distances between the correct value and the recommended value
+ * Z[i] = position i of correct itemID - position of correct itemID in recommendation list
+ * can be positive or negative
+ * the smaller the better - means recommendations are closer
+ * both are the same length, and both sample from the same set
+ *
+ * destructive to items arrays - allows N log N instead of N^2 order
+ */
+ private static void getVectorZ(Long[] itemsR, Long[] itemsL, int[] vectorZ, int[] vectorZabs) {
+ int nitems = itemsR.length;
+ int bottom = 0;
+ int top = nitems - 1;
+ for (int i = 0; i < nitems; i++) {
+ long itemID = itemsR[i];
+ for (int j = bottom; j <= top; j++) {
+ if (itemsL[j] == null) {
+ continue;
+ }
+ long test = itemsL[j];
+ if (itemID == test) {
+ vectorZ[i] = i - j;
+ vectorZabs[i] = Math.abs(i - j);
+ if (j == bottom) {
+ bottom++;
+ } else if (j == top) {
+ top--;
+ } else {
+ itemsL[j] = null;
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ /**
+ * Ranks are the position of the value from low to high, divided by the # of values.
+ * I had to walk through it a few times.
+ */
+ private static void wilcoxonRanks(int[] vectorZ, int[] vectorZabs, double[] ranks, double[] ranksAbs) {
+ int nitems = vectorZ.length;
+ int[] sorted = vectorZabs.clone();
+ Arrays.sort(sorted);
+ int zeros = 0;
+ for (; zeros < nitems; zeros++) {
+ if (sorted[zeros] > 0) {
+ break;
+ }
+ }
+ for (int i = 0; i < nitems; i++) {
+ double rank = 0.0;
+ int count = 0;
+ int score = vectorZabs[i];
+ for (int j = 0; j < nitems; j++) {
+ if (score == sorted[j]) {
+ rank += j + 1 - zeros;
+ count++;
+ } else if (score < sorted[j]) {
+ break;
+ }
+ }
+ if (vectorZ[i] != 0) {
+ ranks[i] = (rank / count) * (vectorZ[i] < 0 ? -1 : 1); // better be at least 1
+ ranksAbs[i] = Math.abs(ranks[i]);
+ }
+ }
+ }
+
+ private static double getMeanRank(int[] ranks) {
+ int nitems = ranks.length;
+ double sum = 0.0;
+ for (int rank : ranks) {
+ sum += rank;
+ }
+ return sum / nitems;
+ }
+
+ private static double getMeanWplus(double[] ranks) {
+ int nitems = ranks.length;
+ double sum = 0.0;
+ for (double rank : ranks) {
+ if (rank > 0) {
+ sum += rank;
+ }
+ }
+ return sum / nitems;
+ }
+
+ private static double getMeanWminus(double[] ranks) {
+ int nitems = ranks.length;
+ double sum = 0.0;
+ for (double rank : ranks) {
+ if (rank < 0) {
+ sum -= rank;
+ }
+ }
+ return sum / nitems;
+ }
+
+ /**
+ * Do bubble sort and return number of swaps needed to match preference lists.
+ * Sort itemsR using itemsL as the reference order.
+ */
+ static long sort(Long[] itemsL, Long[] itemsR) {
+ int length = itemsL.length;
+ if (length < 2) {
+ return 0;
+ }
+ if (length == 2) {
+ return itemsL[0].longValue() == itemsR[0].longValue() ? 0 : 1;
+ }
+ // 1) avoid changing originals; 2) primitive type is more efficient
+ long[] reference = new long[length];
+ long[] sortable = new long[length];
+ for (int i = 0; i < length; i++) {
+ reference[i] = itemsL[i];
+ sortable[i] = itemsR[i];
+ }
+ int sorted = 0;
+ long swaps = 0;
+ while (sorted < length - 1) {
+ // opportunistically trim back the top
+ while (length > 0 && reference[length - 1] == sortable[length - 1]) {
+ length--;
+ }
+ if (length == 0) {
+ break;
+ }
+ if (reference[sorted] == sortable[sorted]) {
+ sorted++;
+ } else {
+ for (int j = sorted; j < length - 1; j++) {
+ // do not swap anything already in place
+ int jump = 1;
+ if (reference[j] == sortable[j]) {
+ while (j + jump < length && reference[j + jump] == sortable[j + jump]) {
+ jump++;
+ }
+ }
+ if (j + jump < length && !(reference[j] == sortable[j] && reference[j + jump] == sortable[j + jump])) {
+ long tmp = sortable[j];
+ sortable[j] = sortable[j + 1];
+ sortable[j + 1] = tmp;
+ swaps++;
+ }
+ }
+ }
+ }
+ return swaps;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
new file mode 100644
index 0000000..97eda10
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/RMSRecommenderEvaluator.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the "root mean squared"
+ * difference between predicted and actual ratings for users. This is the square root of the average of this
+ * difference, squared.
+ * </p>
+ */
+public final class RMSRecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
+
+ private RunningAverage average;
+
+ @Override
+ protected void reset() {
+ average = new FullRunningAverage();
+ }
+
+ @Override
+ protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+ double diff = realPref.getValue() - estimatedPreference;
+ average.addDatum(diff * diff);
+ }
+
+ @Override
+ protected double computeFinalEvaluation() {
+ return Math.sqrt(average.getAverage());
+ }
+
+ @Override
+ public String toString() {
+ return "RMSRecommenderEvaluator";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
new file mode 100644
index 0000000..036d0b4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/StatsCallable.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+final class StatsCallable implements Callable<Void> {
+
+ private static final Logger log = LoggerFactory.getLogger(StatsCallable.class);
+
+ private final Callable<Void> delegate;
+ private final boolean logStats;
+ private final RunningAverageAndStdDev timing;
+ private final AtomicInteger noEstimateCounter;
+
+ StatsCallable(Callable<Void> delegate,
+ boolean logStats,
+ RunningAverageAndStdDev timing,
+ AtomicInteger noEstimateCounter) {
+ this.delegate = delegate;
+ this.logStats = logStats;
+ this.timing = timing;
+ this.noEstimateCounter = noEstimateCounter;
+ }
+
+ @Override
+ public Void call() throws Exception {
+ long start = System.currentTimeMillis();
+ delegate.call();
+ long end = System.currentTimeMillis();
+ timing.addDatum(end - start);
+ if (logStats) {
+ Runtime runtime = Runtime.getRuntime();
+ int average = (int) timing.getAverage();
+ log.info("Average time per recommendation: {}ms", average);
+ long totalMemory = runtime.totalMemory();
+ long memory = totalMemory - runtime.freeMemory();
+ log.info("Approximate memory used: {}MB / {}MB", memory / 1000000L, totalMemory / 1000000L);
+ log.info("Unable to recommend in {} cases", noEstimateCounter.get());
+ }
+ return null;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
new file mode 100644
index 0000000..a1a2a1f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractDataModel.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.model.DataModel;
+
+/**
+ * Contains some features common to all implementations.
+ */
+public abstract class AbstractDataModel implements DataModel {
+
+ private float maxPreference;
+ private float minPreference;
+
+ protected AbstractDataModel() {
+ maxPreference = Float.NaN;
+ minPreference = Float.NaN;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return maxPreference;
+ }
+
+ protected void setMaxPreference(float maxPreference) {
+ this.maxPreference = maxPreference;
+ }
+
+ @Override
+ public float getMinPreference() {
+ return minPreference;
+ }
+
+ protected void setMinPreference(float minPreference) {
+ this.minPreference = minPreference;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
new file mode 100644
index 0000000..6efa6fa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractIDMigrator.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Collection;
+
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.model.IDMigrator;
+
+public abstract class AbstractIDMigrator implements IDMigrator {
+
+ private final MessageDigest md5Digest;
+
+ protected AbstractIDMigrator() {
+ try {
+ md5Digest = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException nsae) {
+ // Can't happen
+ throw new IllegalStateException(nsae);
+ }
+ }
+
+ /**
+ * @return most significant 8 bytes of the MD5 hash of the string, as a long
+ */
+ protected final long hash(String value) {
+ byte[] md5hash;
+ synchronized (md5Digest) {
+ md5hash = md5Digest.digest(value.getBytes(Charsets.UTF_8));
+ md5Digest.reset();
+ }
+ long hash = 0L;
+ for (int i = 0; i < 8; i++) {
+ hash = hash << 8 | md5hash[i] & 0x00000000000000FFL;
+ }
+ return hash;
+ }
+
+ @Override
+ public long toLongID(String stringID) {
+ return hash(stringID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
new file mode 100644
index 0000000..cd3a434
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/AbstractJDBCIDMigrator.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.UpdatableIDMigrator;
+import org.apache.mahout.common.IOUtils;
+
+/**
+ * Implementation which stores the reverse long-to-String mapping in a database. Subclasses can override and
+ * configure the class to operate with particular databases by supplying appropriate SQL statements to the
+ * constructor.
+ */
+public abstract class AbstractJDBCIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator {
+
+ public static final String DEFAULT_MAPPING_TABLE = "taste_id_mapping";
+ public static final String DEFAULT_LONG_ID_COLUMN = "long_id";
+ public static final String DEFAULT_STRING_ID_COLUMN = "string_id";
+
+ private final DataSource dataSource;
+ private final String getStringIDSQL;
+ private final String storeMappingSQL;
+
+ /**
+ * @param getStringIDSQL
+ * SQL statement which selects one column, the String ID, from a mapping table. The statement
+ * should take one long parameter.
+ * @param storeMappingSQL
+ * SQL statement which saves a mapping from long to String. It should take two parameters, a long
+ * and a String.
+ */
+ protected AbstractJDBCIDMigrator(DataSource dataSource, String getStringIDSQL, String storeMappingSQL) {
+ this.dataSource = dataSource;
+ this.getStringIDSQL = getStringIDSQL;
+ this.storeMappingSQL = storeMappingSQL;
+ }
+
+ @Override
+ public final void storeMapping(long longID, String stringID) throws TasteException {
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(storeMappingSQL);
+ stmt.setLong(1, longID);
+ stmt.setString(2, stringID);
+ stmt.executeUpdate();
+ } catch (SQLException sqle) {
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(null, stmt, conn);
+ }
+ }
+
+ @Override
+ public final String toStringID(long longID) throws TasteException {
+ Connection conn = null;
+ PreparedStatement stmt = null;
+ ResultSet rs = null;
+ try {
+ conn = dataSource.getConnection();
+ stmt = conn.prepareStatement(getStringIDSQL, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
+ stmt.setFetchDirection(ResultSet.FETCH_FORWARD);
+ stmt.setFetchSize(1);
+ stmt.setLong(1, longID);
+ rs = stmt.executeQuery();
+ if (rs.next()) {
+ return rs.getString(1);
+ } else {
+ return null;
+ }
+ } catch (SQLException sqle) {
+ throw new TasteException(sqle);
+ } finally {
+ IOUtils.quietClose(rs, stmt, conn);
+ }
+ }
+
+ @Override
+ public void initialize(Iterable<String> stringIDs) throws TasteException {
+ for (String stringID : stringIDs) {
+ storeMapping(toLongID(stringID), stringID);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
new file mode 100644
index 0000000..6db5807
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanItemPreferenceArray.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link BooleanUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather
+ * than one user.
+ * </p>
+ *
+ * @see BooleanPreference
+ * @see BooleanUserPreferenceArray
+ * @see GenericItemPreferenceArray
+ */
+public final class BooleanItemPreferenceArray implements PreferenceArray {
+
+ private final long[] ids;
+ private long id;
+
+ public BooleanItemPreferenceArray(int size) {
+ this.ids = new long[size];
+ this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+ }
+
+ public BooleanItemPreferenceArray(List<? extends Preference> prefs, boolean forOneUser) {
+ this(prefs.size());
+ int size = prefs.size();
+ for (int i = 0; i < size; i++) {
+ Preference pref = prefs.get(i);
+ ids[i] = forOneUser ? pref.getItemID() : pref.getUserID();
+ }
+ if (size > 0) {
+ id = forOneUser ? prefs.get(0).getUserID() : prefs.get(0).getItemID();
+ }
+ }
+
+ /**
+ * This is a private copy constructor for clone().
+ */
+ private BooleanItemPreferenceArray(long[] ids, long id) {
+ this.ids = ids;
+ this.id = id;
+ }
+
+ @Override
+ public int length() {
+ return ids.length;
+ }
+
+ @Override
+ public Preference get(int i) {
+ return new PreferenceView(i);
+ }
+
+ @Override
+ public void set(int i, Preference pref) {
+ id = pref.getItemID();
+ ids[i] = pref.getUserID();
+ }
+
+ @Override
+ public long getUserID(int i) {
+ return ids[i];
+ }
+
+ @Override
+ public void setUserID(int i, long userID) {
+ ids[i] = userID;
+ }
+
+ @Override
+ public long getItemID(int i) {
+ return id;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Note that this method will actually set the item ID for <em>all</em> preferences.
+ */
+ @Override
+ public void setItemID(int i, long itemID) {
+ id = itemID;
+ }
+
+ /**
+ * @return all user IDs
+ */
+ @Override
+ public long[] getIDs() {
+ return ids;
+ }
+
+ @Override
+ public float getValue(int i) {
+ return 1.0f;
+ }
+
+ @Override
+ public void setValue(int i, float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void sortByUser() {
+ Arrays.sort(ids);
+ }
+
+ @Override
+ public void sortByItem() { }
+
+ @Override
+ public void sortByValue() { }
+
+ @Override
+ public void sortByValueReversed() { }
+
+ @Override
+ public boolean hasPrefWithUserID(long userID) {
+ for (long id : ids) {
+ if (userID == id) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public boolean hasPrefWithItemID(long itemID) {
+ return id == itemID;
+ }
+
+ @Override
+ public BooleanItemPreferenceArray clone() {
+ return new BooleanItemPreferenceArray(ids.clone(), id);
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof BooleanItemPreferenceArray)) {
+ return false;
+ }
+ BooleanItemPreferenceArray otherArray = (BooleanItemPreferenceArray) other;
+ return id == otherArray.id && Arrays.equals(ids, otherArray.ids);
+ }
+
+ @Override
+ public Iterator<Preference> iterator() {
+ return Iterators.transform(new CountingIterator(length()),
+ new Function<Integer, Preference>() {
+ @Override
+ public Preference apply(Integer from) {
+ return new PreferenceView(from);
+ }
+ });
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder result = new StringBuilder(10 * ids.length);
+ result.append("BooleanItemPreferenceArray[itemID:");
+ result.append(id);
+ result.append(",{");
+ for (int i = 0; i < ids.length; i++) {
+ if (i > 0) {
+ result.append(',');
+ }
+ result.append(ids[i]);
+ }
+ result.append("}]");
+ return result.toString();
+ }
+
+ private final class PreferenceView implements Preference {
+
+ private final int i;
+
+ private PreferenceView(int i) {
+ this.i = i;
+ }
+
+ @Override
+ public long getUserID() {
+ return BooleanItemPreferenceArray.this.getUserID(i);
+ }
+
+ @Override
+ public long getItemID() {
+ return BooleanItemPreferenceArray.this.getItemID(i);
+ }
+
+ @Override
+ public float getValue() {
+ return 1.0f;
+ }
+
+ @Override
+ public void setValue(float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
new file mode 100644
index 0000000..2093af8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanPreference.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * Encapsulates a simple boolean "preference" for an item whose value does not matter (is fixed at 1.0). This
+ * is appropriate in situations where users conceptually have only a general "yes" preference for items,
+ * rather than a spectrum of preference values.
+ */
+public final class BooleanPreference implements Preference, Serializable {
+
+ private final long userID;
+ private final long itemID;
+
+ public BooleanPreference(long userID, long itemID) {
+ this.userID = userID;
+ this.itemID = itemID;
+ }
+
+ @Override
+ public long getUserID() {
+ return userID;
+ }
+
+ @Override
+ public long getItemID() {
+ return itemID;
+ }
+
+ @Override
+ public float getValue() {
+ return 1.0f;
+ }
+
+ @Override
+ public void setValue(float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String toString() {
+ return "BooleanPreference[userID: " + userID + ", itemID:" + itemID + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
new file mode 100644
index 0000000..629e0cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/BooleanUserPreferenceArray.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link GenericUserPreferenceArray} but stores, conceptually, {@link BooleanPreference} objects which
+ * have no associated preference value.
+ * </p>
+ *
+ * @see BooleanPreference
+ * @see BooleanItemPreferenceArray
+ * @see GenericUserPreferenceArray
+ */
+public final class BooleanUserPreferenceArray implements PreferenceArray {
+
+ private final long[] ids;
+ private long id;
+
+ public BooleanUserPreferenceArray(int size) {
+ this.ids = new long[size];
+ this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+ }
+
+ public BooleanUserPreferenceArray(List<? extends Preference> prefs) {
+ this(prefs.size());
+ int size = prefs.size();
+ for (int i = 0; i < size; i++) {
+ Preference pref = prefs.get(i);
+ ids[i] = pref.getItemID();
+ }
+ if (size > 0) {
+ id = prefs.get(0).getUserID();
+ }
+ }
+
+ /**
+ * This is a private copy constructor for clone().
+ */
+ private BooleanUserPreferenceArray(long[] ids, long id) {
+ this.ids = ids;
+ this.id = id;
+ }
+
+ @Override
+ public int length() {
+ return ids.length;
+ }
+
+ @Override
+ public Preference get(int i) {
+ return new PreferenceView(i);
+ }
+
+ @Override
+ public void set(int i, Preference pref) {
+ id = pref.getUserID();
+ ids[i] = pref.getItemID();
+ }
+
+ @Override
+ public long getUserID(int i) {
+ return id;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Note that this method will actually set the user ID for <em>all</em> preferences.
+ */
+ @Override
+ public void setUserID(int i, long userID) {
+ id = userID;
+ }
+
+ @Override
+ public long getItemID(int i) {
+ return ids[i];
+ }
+
+ @Override
+ public void setItemID(int i, long itemID) {
+ ids[i] = itemID;
+ }
+
+ /**
+ * @return all item IDs
+ */
+ @Override
+ public long[] getIDs() {
+ return ids;
+ }
+
+ @Override
+ public float getValue(int i) {
+ return 1.0f;
+ }
+
+ @Override
+ public void setValue(int i, float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void sortByUser() { }
+
+ @Override
+ public void sortByItem() {
+ Arrays.sort(ids);
+ }
+
+ @Override
+ public void sortByValue() { }
+
+ @Override
+ public void sortByValueReversed() { }
+
+ @Override
+ public boolean hasPrefWithUserID(long userID) {
+ return id == userID;
+ }
+
+ @Override
+ public boolean hasPrefWithItemID(long itemID) {
+ for (long id : ids) {
+ if (itemID == id) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public BooleanUserPreferenceArray clone() {
+ return new BooleanUserPreferenceArray(ids.clone(), id);
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof BooleanUserPreferenceArray)) {
+ return false;
+ }
+ BooleanUserPreferenceArray otherArray = (BooleanUserPreferenceArray) other;
+ return id == otherArray.id && Arrays.equals(ids, otherArray.ids);
+ }
+
+ @Override
+ public Iterator<Preference> iterator() {
+ return Iterators.transform(new CountingIterator(length()),
+ new Function<Integer, Preference>() {
+ @Override
+ public Preference apply(Integer from) {
+ return new PreferenceView(from);
+ }
+ });
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder result = new StringBuilder(10 * ids.length);
+ result.append("BooleanUserPreferenceArray[userID:");
+ result.append(id);
+ result.append(",{");
+ for (int i = 0; i < ids.length; i++) {
+ if (i > 0) {
+ result.append(',');
+ }
+ result.append(ids[i]);
+ }
+ result.append("}]");
+ return result.toString();
+ }
+
+ private final class PreferenceView implements Preference {
+
+ private final int i;
+
+ private PreferenceView(int i) {
+ this.i = i;
+ }
+
+ @Override
+ public long getUserID() {
+ return BooleanUserPreferenceArray.this.getUserID(i);
+ }
+
+ @Override
+ public long getItemID() {
+ return BooleanUserPreferenceArray.this.getItemID(i);
+ }
+
+ @Override
+ public float getValue() {
+ return 1.0f;
+ }
+
+ @Override
+ public void setValue(float value) {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
new file mode 100644
index 0000000..2c1ff4d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java
@@ -0,0 +1,320 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import com.google.common.base.Preconditions;
+
/**
 * <p>
 * A simple {@link DataModel} which uses given user data as its data source. Preferences are
 * boolean ("yes"-only): every preference value is 1.0. This implementation
 * is mostly useful for small experiments and is not recommended for contexts where performance is important.
 * </p>
 */
public final class GenericBooleanPrefDataModel extends AbstractDataModel {

  // All known user IDs, kept sorted ascending
  private final long[] userIDs;
  // user ID -> set of item IDs that user has expressed a preference for
  private final FastByIDMap<FastIDSet> preferenceFromUsers;
  // All known item IDs, kept sorted ascending
  private final long[] itemIDs;
  // Transposed index: item ID -> set of user IDs that prefer that item
  private final FastByIDMap<FastIDSet> preferenceForItems;
  // Optional: user ID -> (item ID -> timestamp in ms since epoch); null when none supplied
  private final FastByIDMap<FastByIDMap<Long>> timestamps;

  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   *
   * @param userData users to include
   */
  public GenericBooleanPrefDataModel(FastByIDMap<FastIDSet> userData) {
    this(userData, null);
  }

  /**
   * <p>
   * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
   * {@link DataModel} retains all this information in memory and is effectively immutable.
   * </p>
   *
   * @param userData users to include
   * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch.
   *          User IDs are mapped to maps of item IDs to Long timestamps.
   */
  public GenericBooleanPrefDataModel(FastByIDMap<FastIDSet> userData, FastByIDMap<FastByIDMap<Long>> timestamps) {
    Preconditions.checkArgument(userData != null, "userData is null");

    // The given map is retained directly (not copied); callers should not mutate it afterwards
    this.preferenceFromUsers = userData;
    this.preferenceForItems = new FastByIDMap<>();
    FastIDSet itemIDSet = new FastIDSet();
    // Build the item -> users transpose while collecting the set of all item IDs
    for (Map.Entry<Long, FastIDSet> entry : preferenceFromUsers.entrySet()) {
      long userID = entry.getKey();
      // NOTE: this local shadows the long[] field of the same name (not yet assigned here)
      FastIDSet itemIDs = entry.getValue();
      itemIDSet.addAll(itemIDs);
      LongPrimitiveIterator it = itemIDs.iterator();
      while (it.hasNext()) {
        long itemID = it.nextLong();
        FastIDSet userIDs = preferenceForItems.get(itemID);
        if (userIDs == null) {
          // Most items are expected to have few users; start small
          userIDs = new FastIDSet(2);
          preferenceForItems.put(itemID, userIDs);
        }
        userIDs.add(userID);
      }
    }

    this.itemIDs = itemIDSet.toArray();
    itemIDSet = null; // Might help GC -- this is big
    Arrays.sort(itemIDs);

    this.userIDs = new long[userData.size()];
    int i = 0;
    LongPrimitiveIterator it = userData.keySetIterator();
    while (it.hasNext()) {
      userIDs[i++] = it.next();
    }
    Arrays.sort(userIDs);

    this.timestamps = timestamps;
  }

  /**
   * <p>
   * Creates a new {@link GenericDataModel} containing an immutable copy of the data from another given
   * {@link DataModel}.
   * </p>
   *
   * @param dataModel
   *          {@link DataModel} to copy
   * @throws TasteException
   *           if an error occurs while retrieving the other {@link DataModel}'s users
   * @deprecated without direct replacement.
   *  Consider {@link #toDataMap(DataModel)} with {@link #GenericBooleanPrefDataModel(FastByIDMap)}
   */
  @Deprecated
  public GenericBooleanPrefDataModel(DataModel dataModel) throws TasteException {
    this(toDataMap(dataModel));
  }

  /**
   * Exports the simple user IDs and associated item IDs in the data model.
   *
   * @return a {@link FastByIDMap} mapping user IDs to {@link FastIDSet}s representing
   *  that user's associated items
   */
  public static FastByIDMap<FastIDSet> toDataMap(DataModel dataModel) throws TasteException {
    FastByIDMap<FastIDSet> data = new FastByIDMap<>(dataModel.getNumUsers());
    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {
      long userID = it.nextLong();
      data.put(userID, dataModel.getItemIDsFromUser(userID));
    }
    return data;
  }

  /**
   * Converts a map of user ID to {@link PreferenceArray} into a map of user ID to {@link FastIDSet},
   * discarding preference values.
   *
   * <p>NOTE: this mutates the given map <em>in place</em>, replacing each value through an unchecked
   * cast, and returns the same map object re-typed. The input must not be used as a
   * {@code FastByIDMap<PreferenceArray>} afterwards.</p>
   */
  public static FastByIDMap<FastIDSet> toDataMap(FastByIDMap<PreferenceArray> data) {
    for (Map.Entry<Long,Object> entry : ((FastByIDMap<Object>) (FastByIDMap<?>) data).entrySet()) {
      PreferenceArray prefArray = (PreferenceArray) entry.getValue();
      int size = prefArray.length();
      FastIDSet itemIDs = new FastIDSet(size);
      for (int i = 0; i < size; i++) {
        itemIDs.add(prefArray.getItemID(i));
      }
      // Replace the PreferenceArray value with the item-ID set, reusing the entry slot
      entry.setValue(itemIDs);
    }
    return (FastByIDMap<FastIDSet>) (FastByIDMap<?>) data;
  }

  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   *
   * @return the live user ID -> item IDs map backing this model (not a copy)
   */
  public FastByIDMap<FastIDSet> getRawUserData() {
    return this.preferenceFromUsers;
  }

  /**
   * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
   *
   * @return the live item ID -> user IDs transpose map backing this model (not a copy)
   */
  public FastByIDMap<FastIDSet> getRawItemData() {
    return this.preferenceForItems;
  }

  @Override
  public LongPrimitiveArrayIterator getUserIDs() {
    // Ascending order, since userIDs was sorted at construction
    return new LongPrimitiveArrayIterator(userIDs);
  }

  /**
   * @throws NoSuchUserException
   *           if there is no such user
   */
  @Override
  public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    // Materialize a boolean preference array: same user ID throughout, value implicitly 1.0
    PreferenceArray prefArray = new BooleanUserPreferenceArray(itemIDs.size());
    int i = 0;
    LongPrimitiveIterator it = itemIDs.iterator();
    while (it.hasNext()) {
      prefArray.setUserID(i, userID);
      prefArray.setItemID(i, it.nextLong());
      i++;
    }
    return prefArray;
  }

  @Override
  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    return itemIDs;
  }

  @Override
  public LongPrimitiveArrayIterator getItemIDs() {
    // Ascending order, since itemIDs was sorted at construction
    return new LongPrimitiveArrayIterator(itemIDs);
  }

  @Override
  public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException {
    FastIDSet userIDs = preferenceForItems.get(itemID);
    if (userIDs == null) {
      throw new NoSuchItemException(itemID);
    }
    // Materialize a boolean preference array: same item ID throughout, value implicitly 1.0
    PreferenceArray prefArray = new BooleanItemPreferenceArray(userIDs.size());
    int i = 0;
    LongPrimitiveIterator it = userIDs.iterator();
    while (it.hasNext()) {
      prefArray.setUserID(i, it.nextLong());
      prefArray.setItemID(i, itemID);
      i++;
    }
    return prefArray;
  }

  /**
   * @return 1.0 if the user has a preference for the item, else null
   */
  @Override
  public Float getPreferenceValue(long userID, long itemID) throws NoSuchUserException {
    FastIDSet itemIDs = preferenceFromUsers.get(userID);
    if (itemIDs == null) {
      throw new NoSuchUserException(userID);
    }
    if (itemIDs.contains(itemID)) {
      return 1.0f;
    }
    return null;
  }

  @Override
  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
    // No timestamps supplied at construction: none available for any preference
    if (timestamps == null) {
      return null;
    }
    FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
    if (itemTimestamps == null) {
      throw new NoSuchUserException(userID);
    }
    return itemTimestamps.get(itemID);
  }

  @Override
  public int getNumItems() {
    return itemIDs.length;
  }

  @Override
  public int getNumUsers() {
    return userIDs.length;
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID) {
    FastIDSet userIDs1 = preferenceForItems.get(itemID);
    return userIDs1 == null ? 0 : userIDs1.size();
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) {
    FastIDSet userIDs1 = preferenceForItems.get(itemID1);
    if (userIDs1 == null) {
      return 0;
    }
    FastIDSet userIDs2 = preferenceForItems.get(itemID2);
    if (userIDs2 == null) {
      return 0;
    }
    // Intersect by probing the larger set with the smaller one's elements
    return userIDs1.size() < userIDs2.size()
        ? userIDs2.intersectionSize(userIDs1)
        : userIDs1.intersectionSize(userIDs2);
  }

  /** Unsupported: this model is effectively immutable. */
  @Override
  public void removePreference(long userID, long itemID) {
    throw new UnsupportedOperationException();
  }

  /** Unsupported: this model is effectively immutable. */
  @Override
  public void setPreference(long userID, long itemID, float value) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Does nothing
  }

  /**
   * @return false, always: boolean preferences have no explicit values
   */
  @Override
  public boolean hasPreferenceValues() {
    return false;
  }

  @Override
  public String toString() {
    StringBuilder result = new StringBuilder(200);
    result.append("GenericBooleanPrefDataModel[users:");
    // Show at most the first three user IDs
    for (int i = 0; i < Math.min(3, userIDs.length); i++) {
      if (i > 0) {
        result.append(',');
      }
      result.append(userIDs[i]);
    }
    if (userIDs.length > 3) {
      result.append("...");
    }
    result.append(']');
    return result.toString();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
new file mode 100644
index 0000000..f58d349
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java
@@ -0,0 +1,361 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link DataModel} which uses a given {@link List} of users as its data source. This implementation
+ * is mostly useful for small experiments and is not recommended for contexts where performance is important.
+ * </p>
+ */
+public final class GenericDataModel extends AbstractDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(GenericDataModel.class);
+
+ // Parallel immutable views of the same data: sorted user/item ID arrays,
+ // per-user and per-item preference maps, and optional per-(user,item) timestamps.
+ private final long[] userIDs;
+ private final FastByIDMap<PreferenceArray> preferenceFromUsers;
+ private final long[] itemIDs;
+ private final FastByIDMap<PreferenceArray> preferenceForItems;
+ private final FastByIDMap<FastByIDMap<Long>> timestamps;
+
+ /**
+ * <p>
+ * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
+ * {@link DataModel} retains all this information in memory and is effectively immutable.
+ * </p>
+ *
+ * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)})
+ */
+ public GenericDataModel(FastByIDMap<PreferenceArray> userData) {
+ this(userData, null);
+ }
+
+ /**
+ * <p>
+ * Creates a new {@link GenericDataModel} from the given users (and their preferences). This
+ * {@link DataModel} retains all this information in memory and is effectively immutable.
+ * </p>
+ *
+ * @param userData users to include; (see also {@link #toDataMap(FastByIDMap, boolean)})
+ * @param timestamps optionally, provided timestamps of preferences as milliseconds since the epoch.
+ * User IDs are mapped to maps of item IDs to Long timestamps.
+ */
+ public GenericDataModel(FastByIDMap<PreferenceArray> userData, FastByIDMap<FastByIDMap<Long>> timestamps) {
+ Preconditions.checkArgument(userData != null, "userData is null");
+
+ this.preferenceFromUsers = userData;
+ FastByIDMap<Collection<Preference>> prefsForItems = new FastByIDMap<>();
+ FastIDSet itemIDSet = new FastIDSet();
+ int currentCount = 0;
+ float maxPrefValue = Float.NEGATIVE_INFINITY;
+ float minPrefValue = Float.POSITIVE_INFINITY;
+ // Single pass over all users: sorts each user's prefs by item, inverts the
+ // user->prefs map into item->prefs, and tracks min/max preference values.
+ for (Map.Entry<Long, PreferenceArray> entry : preferenceFromUsers.entrySet()) {
+ PreferenceArray prefs = entry.getValue();
+ prefs.sortByItem();
+ for (Preference preference : prefs) {
+ long itemID = preference.getItemID();
+ itemIDSet.add(itemID);
+ Collection<Preference> prefsForItem = prefsForItems.get(itemID);
+ if (prefsForItem == null) {
+ prefsForItem = Lists.newArrayListWithCapacity(2);
+ prefsForItems.put(itemID, prefsForItem);
+ }
+ prefsForItem.add(preference);
+ float value = preference.getValue();
+ if (value > maxPrefValue) {
+ maxPrefValue = value;
+ }
+ if (value < minPrefValue) {
+ minPrefValue = value;
+ }
+ }
+ if (++currentCount % 10000 == 0) {
+ log.info("Processed {} users", currentCount);
+ }
+ }
+ log.info("Processed {} users", currentCount);
+
+ // NOTE(review): if userData is empty, min/max remain +/- infinity here —
+ // TODO confirm downstream consumers tolerate that.
+ setMinPreference(minPrefValue);
+ setMaxPreference(maxPrefValue);
+
+ this.itemIDs = itemIDSet.toArray();
+ itemIDSet = null; // Might help GC -- this is big
+ Arrays.sort(itemIDs);
+
+ this.preferenceForItems = toDataMap(prefsForItems, false);
+
+ // Per-item arrays are sorted by user ID; getNumUsersWithPreferenceFor(long,long)
+ // depends on this ordering for its merge-style intersection.
+ for (Map.Entry<Long, PreferenceArray> entry : preferenceForItems.entrySet()) {
+ entry.getValue().sortByUser();
+ }
+
+ this.userIDs = new long[userData.size()];
+ int i = 0;
+ LongPrimitiveIterator it = userData.keySetIterator();
+ while (it.hasNext()) {
+ userIDs[i++] = it.next();
+ }
+ Arrays.sort(userIDs);
+
+ this.timestamps = timestamps;
+ }
+
+ /**
+ * <p>
+ * Creates a new {@link GenericDataModel} containing an immutable copy of the data from another given
+ * {@link DataModel}.
+ * </p>
+ *
+ * @param dataModel {@link DataModel} to copy
+ * @throws TasteException if an error occurs while retrieving the other {@link DataModel}'s users
+ * @deprecated without direct replacement.
+ * Consider {@link #toDataMap(DataModel)} with {@link #GenericDataModel(FastByIDMap)}
+ */
+ @Deprecated
+ public GenericDataModel(DataModel dataModel) throws TasteException {
+ this(toDataMap(dataModel));
+ }
+
+ /**
+ * Swaps, in-place, {@link List}s for arrays in {@link Map} values.
+ *
+ * @return input value
+ */
+ public static FastByIDMap<PreferenceArray> toDataMap(FastByIDMap<Collection<Preference>> data,
+ boolean byUser) {
+ // Unchecked double-cast deliberately reuses the same map instance with a new
+ // value type, avoiding a full copy; each List value is replaced in place by an
+ // equivalent PreferenceArray.
+ for (Map.Entry<Long,Object> entry : ((FastByIDMap<Object>) (FastByIDMap<?>) data).entrySet()) {
+ List<Preference> prefList = (List<Preference>) entry.getValue();
+ entry.setValue(byUser ? new GenericUserPreferenceArray(prefList) : new GenericItemPreferenceArray(
+ prefList));
+ }
+ return (FastByIDMap<PreferenceArray>) (FastByIDMap<?>) data;
+ }
+
+ /**
+ * Exports the simple user IDs and preferences in the data model.
+ *
+ * @return a {@link FastByIDMap} mapping user IDs to {@link PreferenceArray}s representing
+ * that user's preferences
+ */
+ public static FastByIDMap<PreferenceArray> toDataMap(DataModel dataModel) throws TasteException {
+ FastByIDMap<PreferenceArray> data = new FastByIDMap<>(dataModel.getNumUsers());
+ LongPrimitiveIterator it = dataModel.getUserIDs();
+ while (it.hasNext()) {
+ long userID = it.nextLong();
+ data.put(userID, dataModel.getPreferencesFromUser(userID));
+ }
+ return data;
+ }
+
+ /**
+ * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
+ */
+ public FastByIDMap<PreferenceArray> getRawUserData() {
+ return this.preferenceFromUsers;
+ }
+
+ /**
+ * This is used mostly internally to the framework, and shouldn't be relied upon otherwise.
+ */
+ public FastByIDMap<PreferenceArray> getRawItemData() {
+ return this.preferenceForItems;
+ }
+
+ @Override
+ public LongPrimitiveArrayIterator getUserIDs() {
+ return new LongPrimitiveArrayIterator(userIDs);
+ }
+
+ /**
+ * @throws NoSuchUserException
+ * if there is no such user
+ */
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws NoSuchUserException {
+ PreferenceArray prefs = preferenceFromUsers.get(userID);
+ if (prefs == null) {
+ throw new NoSuchUserException(userID);
+ }
+ return prefs;
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ PreferenceArray prefs = getPreferencesFromUser(userID);
+ int size = prefs.length();
+ FastIDSet result = new FastIDSet(size);
+ for (int i = 0; i < size; i++) {
+ result.add(prefs.getItemID(i));
+ }
+ return result;
+ }
+
+ @Override
+ public LongPrimitiveArrayIterator getItemIDs() {
+ return new LongPrimitiveArrayIterator(itemIDs);
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws NoSuchItemException {
+ PreferenceArray prefs = preferenceForItems.get(itemID);
+ if (prefs == null) {
+ throw new NoSuchItemException(itemID);
+ }
+ return prefs;
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ // Linear scan of the user's preferences; returns null when the user has no
+ // preference for this item (but throws NoSuchUserException for an unknown user).
+ PreferenceArray prefs = getPreferencesFromUser(userID);
+ int size = prefs.length();
+ for (int i = 0; i < size; i++) {
+ if (prefs.getItemID(i) == itemID) {
+ return prefs.getValue(i);
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ if (timestamps == null) {
+ return null;
+ }
+ // NOTE(review): returns null when no timestamps were supplied at all, but throws
+ // NoSuchUserException when timestamps exist and this user has none — confirm the
+ // asymmetry is intended.
+ FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
+ if (itemTimestamps == null) {
+ throw new NoSuchUserException(userID);
+ }
+ return itemTimestamps.get(itemID);
+ }
+
+ @Override
+ public int getNumItems() {
+ return itemIDs.length;
+ }
+
+ @Override
+ public int getNumUsers() {
+ return userIDs.length;
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) {
+ PreferenceArray prefs1 = preferenceForItems.get(itemID);
+ return prefs1 == null ? 0 : prefs1.length();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) {
+ PreferenceArray prefs1 = preferenceForItems.get(itemID1);
+ if (prefs1 == null) {
+ return 0;
+ }
+ PreferenceArray prefs2 = preferenceForItems.get(itemID2);
+ if (prefs2 == null) {
+ return 0;
+ }
+
+ // Merge-style intersection count over the two user-ID lists; correctness relies
+ // on per-item arrays having been sorted by user ID in the constructor.
+ int size1 = prefs1.length();
+ int size2 = prefs2.length();
+ int count = 0;
+ int i = 0;
+ int j = 0;
+ long userID1 = prefs1.getUserID(0);
+ long userID2 = prefs2.getUserID(0);
+ while (true) {
+ if (userID1 < userID2) {
+ if (++i == size1) {
+ break;
+ }
+ userID1 = prefs1.getUserID(i);
+ } else if (userID1 > userID2) {
+ if (++j == size2) {
+ break;
+ }
+ userID2 = prefs2.getUserID(j);
+ } else {
+ count++;
+ if (++i == size1 || ++j == size2) {
+ break;
+ }
+ userID1 = prefs1.getUserID(i);
+ userID2 = prefs2.getUserID(j);
+ }
+ }
+ return count;
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) {
+ // This model is effectively immutable; mutation is unsupported.
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) {
+ // This model is effectively immutable; mutation is unsupported.
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // Does nothing
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ // Shows at most the first three user IDs to keep the string short.
+ StringBuilder result = new StringBuilder(200);
+ result.append("GenericDataModel[users:");
+ for (int i = 0; i < Math.min(3, userIDs.length); i++) {
+ if (i > 0) {
+ result.append(',');
+ }
+ result.append(userIDs[i]);
+ }
+ if (userIDs.length > 3) {
+ result.append("...");
+ }
+ result.append(']');
+ return result.toString();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
new file mode 100644
index 0000000..fde9314
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericItemPreferenceArray.java
@@ -0,0 +1,301 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link GenericUserPreferenceArray} but stores preferences for one item (all item IDs the same) rather
+ * than one user.
+ * </p>
+ *
+ * @see BooleanItemPreferenceArray
+ * @see GenericUserPreferenceArray
+ * @see GenericPreference
+ */
+public final class GenericItemPreferenceArray implements PreferenceArray {
+
+ // Sort-key selectors for lateralSort(); 1 (ITEM) is unused since all item IDs are equal.
+ private static final int USER = 0;
+ private static final int VALUE = 2;
+ private static final int VALUE_REVERSED = 3;
+
+ // ids holds user IDs; id is the single item ID shared by every preference;
+ // values is parallel to ids.
+ private final long[] ids;
+ private long id;
+ private final float[] values;
+
+ public GenericItemPreferenceArray(int size) {
+ this.ids = new long[size];
+ values = new float[size];
+ this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+ }
+
+ public GenericItemPreferenceArray(List<? extends Preference> prefs) {
+ this(prefs.size());
+ int size = prefs.size();
+ long itemID = Long.MIN_VALUE;
+ for (int i = 0; i < size; i++) {
+ Preference pref = prefs.get(i);
+ ids[i] = pref.getUserID();
+ if (i == 0) {
+ itemID = pref.getItemID();
+ } else {
+ // All preferences must refer to the same item.
+ if (itemID != pref.getItemID()) {
+ throw new IllegalArgumentException("Not all item IDs are the same");
+ }
+ }
+ values[i] = pref.getValue();
+ }
+ id = itemID;
+ }
+
+ /**
+ * This is a private copy constructor for clone().
+ */
+ private GenericItemPreferenceArray(long[] ids, long id, float[] values) {
+ this.ids = ids;
+ this.id = id;
+ this.values = values;
+ }
+
+ @Override
+ public int length() {
+ return ids.length;
+ }
+
+ @Override
+ public Preference get(int i) {
+ // Returns a live view; changes through it write back to this array.
+ return new PreferenceView(i);
+ }
+
+ @Override
+ public void set(int i, Preference pref) {
+ // Note: overwrites the shared item ID for the whole array.
+ id = pref.getItemID();
+ ids[i] = pref.getUserID();
+ values[i] = pref.getValue();
+ }
+
+ @Override
+ public long getUserID(int i) {
+ return ids[i];
+ }
+
+ @Override
+ public void setUserID(int i, long userID) {
+ ids[i] = userID;
+ }
+
+ @Override
+ public long getItemID(int i) {
+ return id;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Note that this method will actually set the item ID for <em>all</em> preferences.
+ */
+ @Override
+ public void setItemID(int i, long itemID) {
+ id = itemID;
+ }
+
+ /**
+ * @return all user IDs
+ */
+ @Override
+ public long[] getIDs() {
+ return ids;
+ }
+
+ @Override
+ public float getValue(int i) {
+ return values[i];
+ }
+
+ @Override
+ public void setValue(int i, float value) {
+ values[i] = value;
+ }
+
+ @Override
+ public void sortByUser() {
+ lateralSort(USER);
+ }
+
+ // No-op: every preference in this array has the same item ID.
+ @Override
+ public void sortByItem() { }
+
+ @Override
+ public void sortByValue() {
+ lateralSort(VALUE);
+ }
+
+ @Override
+ public void sortByValueReversed() {
+ lateralSort(VALUE_REVERSED);
+ }
+
+ @Override
+ public boolean hasPrefWithUserID(long userID) {
+ for (long id : ids) {
+ if (userID == id) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public boolean hasPrefWithItemID(long itemID) {
+ return id == itemID;
+ }
+
+ // Sorts ids and values together ("laterally") by the key selected by type.
+ private void lateralSort(int type) {
+ //Comb sort: http://en.wikipedia.org/wiki/Comb_sort
+ int length = length();
+ int gap = length;
+ boolean swapped = false;
+ while (gap > 1 || swapped) {
+ if (gap > 1) {
+ // int compound assignment truncates the quotient; gap shrinks toward 1
+ gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi)
+ }
+ swapped = false;
+ int max = length - gap;
+ for (int i = 0; i < max; i++) {
+ int other = i + gap;
+ if (isLess(other, i, type)) {
+ swap(i, other);
+ swapped = true;
+ }
+ }
+ }
+ }
+
+ private boolean isLess(int i, int j, int type) {
+ switch (type) {
+ case USER:
+ return ids[i] < ids[j];
+ case VALUE:
+ return values[i] < values[j];
+ case VALUE_REVERSED:
+ return values[i] > values[j];
+ default:
+ throw new IllegalStateException();
+ }
+ }
+
+ // Swaps entries i and j in both parallel arrays.
+ private void swap(int i, int j) {
+ long temp1 = ids[i];
+ float temp2 = values[i];
+ ids[i] = ids[j];
+ values[i] = values[j];
+ ids[j] = temp1;
+ values[j] = temp2;
+ }
+
+ @Override
+ public GenericItemPreferenceArray clone() {
+ return new GenericItemPreferenceArray(ids.clone(), id, values.clone());
+ }
+
+ @Override
+ public int hashCode() {
+ // Mixes both halves of the 64-bit id with the array hashes.
+ return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof GenericItemPreferenceArray)) {
+ return false;
+ }
+ GenericItemPreferenceArray otherArray = (GenericItemPreferenceArray) other;
+ return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values);
+ }
+
+ @Override
+ public Iterator<Preference> iterator() {
+ // Lazily yields a PreferenceView per index; views are backed by this array.
+ return Iterators.transform(new CountingIterator(length()),
+ new Function<Integer, Preference>() {
+ @Override
+ public Preference apply(Integer from) {
+ return new PreferenceView(from);
+ }
+ });
+ }
+
+ @Override
+ public String toString() {
+ if (ids == null || ids.length == 0) {
+ return "GenericItemPreferenceArray[{}]";
+ }
+ StringBuilder result = new StringBuilder(20 * ids.length);
+ result.append("GenericItemPreferenceArray[itemID:");
+ result.append(id);
+ result.append(",{");
+ for (int i = 0; i < ids.length; i++) {
+ if (i > 0) {
+ result.append(',');
+ }
+ result.append(ids[i]);
+ result.append('=');
+ result.append(values[i]);
+ }
+ result.append("}]");
+ return result.toString();
+ }
+
+ // Lightweight Preference view over index i of the enclosing arrays; writes
+ // through setValue mutate the backing array.
+ private final class PreferenceView implements Preference {
+
+ private final int i;
+
+ private PreferenceView(int i) {
+ this.i = i;
+ }
+
+ @Override
+ public long getUserID() {
+ return GenericItemPreferenceArray.this.getUserID(i);
+ }
+
+ @Override
+ public long getItemID() {
+ return GenericItemPreferenceArray.this.getItemID(i);
+ }
+
+ @Override
+ public float getValue() {
+ return values[i];
+ }
+
+ @Override
+ public void setValue(float value) {
+ values[i] = value;
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
new file mode 100644
index 0000000..e6c7f43
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericPreference.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.model.Preference;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A simple {@link Preference} encapsulating an item and preference value.
+ * </p>
+ */
+public class GenericPreference implements Preference, Serializable {
+
+ // userID and itemID are immutable; only the preference value may change.
+ private final long userID;
+ private final long itemID;
+ private float value;
+
+ public GenericPreference(long userID, long itemID, float value) {
+ // NaN is rejected here and in setValue, so value is always a real number.
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ this.userID = userID;
+ this.itemID = itemID;
+ this.value = value;
+ }
+
+ @Override
+ public long getUserID() {
+ return userID;
+ }
+
+ @Override
+ public long getItemID() {
+ return itemID;
+ }
+
+ @Override
+ public float getValue() {
+ return value;
+ }
+
+ @Override
+ public void setValue(float value) {
+ Preconditions.checkArgument(!Float.isNaN(value), "NaN value");
+ this.value = value;
+ }
+
+ @Override
+ public String toString() {
+ return "GenericPreference[userID: " + userID + ", itemID:" + itemID + ", value:" + value + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
new file mode 100644
index 0000000..647feeb
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericUserPreferenceArray.java
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.CountingIterator;
+
+/**
+ * <p>
+ * Like {@link GenericItemPreferenceArray} but stores preferences for one user (all user IDs the same) rather
+ * than one item.
+ * </p>
+ *
+ * <p>
+ * This implementation maintains two parallel arrays, of item IDs and values. The idea is to save allocating
+ * {@link Preference} objects themselves. This saves the overhead of {@link Preference} objects but also
+ * duplicating the user ID value.
+ * </p>
+ *
+ * @see BooleanUserPreferenceArray
+ * @see GenericItemPreferenceArray
+ * @see GenericPreference
+ */
+public final class GenericUserPreferenceArray implements PreferenceArray {
+
+ // Sort-key selectors for lateralSort(); 0 (USER) is unused since all user IDs are equal.
+ private static final int ITEM = 1;
+ private static final int VALUE = 2;
+ private static final int VALUE_REVERSED = 3;
+
+ // ids holds item IDs; id is the single user ID shared by every preference;
+ // values is parallel to ids.
+ private final long[] ids;
+ private long id;
+ private final float[] values;
+
+ public GenericUserPreferenceArray(int size) {
+ this.ids = new long[size];
+ values = new float[size];
+ this.id = Long.MIN_VALUE; // as a sort of 'unspecified' value
+ }
+
+ public GenericUserPreferenceArray(List<? extends Preference> prefs) {
+ this(prefs.size());
+ int size = prefs.size();
+ long userID = Long.MIN_VALUE;
+ for (int i = 0; i < size; i++) {
+ Preference pref = prefs.get(i);
+ if (i == 0) {
+ userID = pref.getUserID();
+ } else {
+ // All preferences must belong to the same user.
+ if (userID != pref.getUserID()) {
+ throw new IllegalArgumentException("Not all user IDs are the same");
+ }
+ }
+ ids[i] = pref.getItemID();
+ values[i] = pref.getValue();
+ }
+ id = userID;
+ }
+
+ /**
+ * This is a private copy constructor for clone().
+ */
+ private GenericUserPreferenceArray(long[] ids, long id, float[] values) {
+ this.ids = ids;
+ this.id = id;
+ this.values = values;
+ }
+
+ @Override
+ public int length() {
+ return ids.length;
+ }
+
+ @Override
+ public Preference get(int i) {
+ // Returns a live view; changes through it write back to this array.
+ return new PreferenceView(i);
+ }
+
+ @Override
+ public void set(int i, Preference pref) {
+ // Note: overwrites the shared user ID for the whole array.
+ id = pref.getUserID();
+ ids[i] = pref.getItemID();
+ values[i] = pref.getValue();
+ }
+
+ @Override
+ public long getUserID(int i) {
+ return id;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Note that this method will actually set the user ID for <em>all</em> preferences.
+ */
+ @Override
+ public void setUserID(int i, long userID) {
+ id = userID;
+ }
+
+ @Override
+ public long getItemID(int i) {
+ return ids[i];
+ }
+
+ @Override
+ public void setItemID(int i, long itemID) {
+ ids[i] = itemID;
+ }
+
+ /**
+ * @return all item IDs
+ */
+ @Override
+ public long[] getIDs() {
+ return ids;
+ }
+
+ @Override
+ public float getValue(int i) {
+ return values[i];
+ }
+
+ @Override
+ public void setValue(int i, float value) {
+ values[i] = value;
+ }
+
+ // No-op: every preference in this array has the same user ID.
+ @Override
+ public void sortByUser() { }
+
+ @Override
+ public void sortByItem() {
+ lateralSort(ITEM);
+ }
+
+ @Override
+ public void sortByValue() {
+ lateralSort(VALUE);
+ }
+
+ @Override
+ public void sortByValueReversed() {
+ lateralSort(VALUE_REVERSED);
+ }
+
+ @Override
+ public boolean hasPrefWithUserID(long userID) {
+ return id == userID;
+ }
+
+ @Override
+ public boolean hasPrefWithItemID(long itemID) {
+ for (long id : ids) {
+ if (itemID == id) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Sorts ids and values together ("laterally") by the key selected by type.
+ private void lateralSort(int type) {
+ //Comb sort: http://en.wikipedia.org/wiki/Comb_sort
+ int length = length();
+ int gap = length;
+ boolean swapped = false;
+ while (gap > 1 || swapped) {
+ if (gap > 1) {
+ // int compound assignment truncates the quotient; gap shrinks toward 1
+ gap /= 1.247330950103979; // = 1 / (1 - 1/e^phi)
+ }
+ swapped = false;
+ int max = length - gap;
+ for (int i = 0; i < max; i++) {
+ int other = i + gap;
+ if (isLess(other, i, type)) {
+ swap(i, other);
+ swapped = true;
+ }
+ }
+ }
+ }
+
+ private boolean isLess(int i, int j, int type) {
+ switch (type) {
+ case ITEM:
+ return ids[i] < ids[j];
+ case VALUE:
+ return values[i] < values[j];
+ case VALUE_REVERSED:
+ return values[i] > values[j];
+ default:
+ throw new IllegalStateException();
+ }
+ }
+
+ // Swaps entries i and j in both parallel arrays.
+ private void swap(int i, int j) {
+ long temp1 = ids[i];
+ float temp2 = values[i];
+ ids[i] = ids[j];
+ values[i] = values[j];
+ ids[j] = temp1;
+ values[j] = temp2;
+ }
+
+ @Override
+ public GenericUserPreferenceArray clone() {
+ return new GenericUserPreferenceArray(ids.clone(), id, values.clone());
+ }
+
+ @Override
+ public int hashCode() {
+ // Mixes both halves of the 64-bit id with the array hashes.
+ return (int) (id >> 32) ^ (int) id ^ Arrays.hashCode(ids) ^ Arrays.hashCode(values);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof GenericUserPreferenceArray)) {
+ return false;
+ }
+ GenericUserPreferenceArray otherArray = (GenericUserPreferenceArray) other;
+ return id == otherArray.id && Arrays.equals(ids, otherArray.ids) && Arrays.equals(values, otherArray.values);
+ }
+
+ @Override
+ public Iterator<Preference> iterator() {
+ // Lazily yields a PreferenceView per index; views are backed by this array.
+ return Iterators.transform(new CountingIterator(length()),
+ new Function<Integer, Preference>() {
+ @Override
+ public Preference apply(Integer from) {
+ return new PreferenceView(from);
+ }
+ });
+ }
+
+ @Override
+ public String toString() {
+ if (ids == null || ids.length == 0) {
+ return "GenericUserPreferenceArray[{}]";
+ }
+ StringBuilder result = new StringBuilder(20 * ids.length);
+ result.append("GenericUserPreferenceArray[userID:");
+ result.append(id);
+ result.append(",{");
+ for (int i = 0; i < ids.length; i++) {
+ if (i > 0) {
+ result.append(',');
+ }
+ result.append(ids[i]);
+ result.append('=');
+ result.append(values[i]);
+ }
+ result.append("}]");
+ return result.toString();
+ }
+
+ // Lightweight Preference view over index i of the enclosing arrays; writes
+ // through setValue mutate the backing array.
+ private final class PreferenceView implements Preference {
+
+ private final int i;
+
+ private PreferenceView(int i) {
+ this.i = i;
+ }
+
+ @Override
+ public long getUserID() {
+ return GenericUserPreferenceArray.this.getUserID(i);
+ }
+
+ @Override
+ public long getItemID() {
+ return GenericUserPreferenceArray.this.getItemID(i);
+ }
+
+ @Override
+ public float getValue() {
+ return values[i];
+ }
+
+ @Override
+ public void setValue(float value) {
+ values[i] = value;
+ }
+
+ }
+
+}
r***@apache.org
2018-06-28 14:54:54 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
new file mode 100644
index 0000000..da318d5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * Implementations of this interface determine the items that are considered relevant,
+ * and splits data into a training and test subset, for purposes of precision/recall
+ * tests as implemented by implementations of {@link RecommenderIRStatsEvaluator}.
+ */
+public interface RelevantItemsDataSplitter {
+
+ /**
+ * During testing, relevant items are removed from a particular user's preferences,
+ * and a model is built using this user's other preferences and all other users.
+ *
+ * @param userID ID of the user whose relevant items are to be determined
+ * @param at Maximum number of items to be removed
+ * @param relevanceThreshold Minimum strength of preference for an item to be considered
+ * relevant
+ * @param dataModel data model from which the user's preferences are read
+ * @return IDs of relevant items
+ * @throws TasteException if an error occurs while accessing the data model
+ */
+ FastIDSet getRelevantItemsIDs(long userID,
+ int at,
+ double relevanceThreshold,
+ DataModel dataModel) throws TasteException;
+
+ /**
+ * Adds a single user and all their preferences to the training model.
+ *
+ * @param userID ID of user whose preferences we are trying to predict
+ * @param relevantItemIDs IDs of items considered relevant to that user
+ * @param trainingUsers the database of training preferences to which we will
+ * append the ones for otherUserID.
+ * @param otherUserID for whom we are adding preferences to the training model
+ * @param dataModel data model from which the other user's preferences are read
+ * @throws TasteException if an error occurs while accessing the data model
+ */
+ void processOtherUser(long userID,
+ FastIDSet relevantItemIDs,
+ FastByIDMap<PreferenceArray> trainingUsers,
+ long otherUserID,
+ DataModel dataModel) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
new file mode 100644
index 0000000..e70a675
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import com.google.common.primitives.Longs;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.mahout.math.Varint;
+
+/** A {@link WritableComparable} encapsulating two items. */
+public final class EntityEntityWritable implements WritableComparable<EntityEntityWritable>, Cloneable {
+
+ private long aID;
+ private long bID;
+
+ public EntityEntityWritable() {
+ // do nothing
+ }
+
+ public EntityEntityWritable(long aID, long bID) {
+ this.aID = aID;
+ this.bID = bID;
+ }
+
+ long getAID() {
+ return aID;
+ }
+
+ long getBID() {
+ return bID;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ Varint.writeSignedVarLong(aID, out);
+ Varint.writeSignedVarLong(bID, out);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ aID = Varint.readSignedVarLong(in);
+ bID = Varint.readSignedVarLong(in);
+ }
+
+ @Override
+ public int compareTo(EntityEntityWritable that) {
+ int aCompare = compare(aID, that.getAID());
+ return aCompare == 0 ? compare(bID, that.getBID()) : aCompare;
+ }
+
+ private static int compare(long a, long b) {
+ return a < b ? -1 : a > b ? 1 : 0;
+ }
+
+ @Override
+ public int hashCode() {
+ return Longs.hashCode(aID) + 31 * Longs.hashCode(bID);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof EntityEntityWritable) {
+ EntityEntityWritable that = (EntityEntityWritable) o;
+ return aID == that.getAID() && bID == that.getBID();
+ }
+ return false;
+ }
+
+ @Override
+ public String toString() {
+ return aID + "\t" + bID;
+ }
+
+ @Override
+ public EntityEntityWritable clone() {
+ return new EntityEntityWritable(aID, bID);
+ }
+
+}
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
new file mode 100644
index 0000000..2aab63c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.VarLongWritable;
+
+/** A {@link org.apache.hadoop.io.Writable} encapsulating an item ID and a preference value. */
+public final class EntityPrefWritable extends VarLongWritable implements Cloneable {
+
+ // the entity ID itself is stored by the VarLongWritable superclass
+ private float prefValue;
+
+ /** No-arg constructor required by the Writable deserialization machinery. */
+ public EntityPrefWritable() {
+ // do nothing
+ }
+
+ /**
+ * @param itemID entity ID, stored in the superclass
+ * @param prefValue preference value associated with the entity
+ */
+ public EntityPrefWritable(long itemID, float prefValue) {
+ super(itemID);
+ this.prefValue = prefValue;
+ }
+
+ /** Copy constructor. */
+ public EntityPrefWritable(EntityPrefWritable other) {
+ this(other.get(), other.getPrefValue());
+ }
+
+ /** @return the entity ID (alias for {@link #get()}) */
+ public long getID() {
+ return get();
+ }
+
+ /** @return the preference value */
+ public float getPrefValue() {
+ return prefValue;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ // superclass writes the ID first, then the raw float follows
+ super.write(out);
+ out.writeFloat(prefValue);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ super.readFields(in);
+ prefValue = in.readFloat();
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode() ^ RandomUtils.hashFloat(prefValue);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof EntityPrefWritable)) {
+ return false;
+ }
+ EntityPrefWritable other = (EntityPrefWritable) o;
+ // exact float comparison: NaN preference values are never equal to anything
+ return get() == other.get() && prefValue == other.getPrefValue();
+ }
+
+ @Override
+ public String toString() {
+ return get() + "\t" + prefValue;
+ }
+
+ @Override
+ public EntityPrefWritable clone() {
+ return new EntityPrefWritable(get(), prefValue);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
new file mode 100644
index 0000000..3de272d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * Mutable variant of {@link RecommendedItem}
+ */
+public class MutableRecommendedItem implements RecommendedItem {
+
+ private long itemID;
+ private float value;
+
+ public MutableRecommendedItem() {}
+
+ public MutableRecommendedItem(long itemID, float value) {
+ this.itemID = itemID;
+ this.value = value;
+ }
+
+ @Override
+ public long getItemID() {
+ return itemID;
+ }
+
+ @Override
+ public float getValue() {
+ return value;
+ }
+
+ public void setItemID(long itemID) {
+ this.itemID = itemID;
+ }
+
+ public void set(long itemID, float value) {
+ this.itemID = itemID;
+ this.value = value;
+ }
+
+ public void capToMaxValue(float maxValue) {
+ if (value > maxValue) {
+ value = maxValue;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "MutableRecommendedItem[item:" + itemID + ", value:" + value + ']';
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) itemID ^ RandomUtils.hashFloat(value);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (!(o instanceof MutableRecommendedItem)) {
+ return false;
+ }
+ RecommendedItem other = (RecommendedItem) o;
+ return itemID == other.getItemID() && value == other.getValue();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
new file mode 100644
index 0000000..bc832aa
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.math.Varint;
+
+/**
+ * A {@link Writable} which encapsulates a list of {@link RecommendedItem}s. This is the mapper (and reducer)
+ * output, and represents items recommended to a user. The first item is the one whose estimated preference is
+ * highest.
+ */
+public final class RecommendedItemsWritable implements Writable {
+
+ private List<RecommendedItem> recommended;
+
+ public RecommendedItemsWritable() {
+ // do nothing
+ }
+
+ public RecommendedItemsWritable(List<RecommendedItem> recommended) {
+ this.recommended = recommended;
+ }
+
+ public List<RecommendedItem> getRecommendedItems() {
+ return recommended;
+ }
+
+ public void set(List<RecommendedItem> recommended) {
+ this.recommended = recommended;
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(recommended.size());
+ for (RecommendedItem item : recommended) {
+ Varint.writeSignedVarLong(item.getItemID(), out);
+ out.writeFloat(item.getValue());
+ }
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ int size = in.readInt();
+ recommended = new ArrayList<>(size);
+ for (int i = 0; i < size; i++) {
+ long itemID = Varint.readSignedVarLong(in);
+ float value = in.readFloat();
+ RecommendedItem recommendedItem = new GenericRecommendedItem(itemID, value);
+ recommended.add(recommendedItem);
+ }
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder result = new StringBuilder(200);
+ result.append('[');
+ boolean first = true;
+ for (RecommendedItem item : recommended) {
+ if (first) {
+ first = false;
+ } else {
+ result.append(',');
+ }
+ result.append(String.valueOf(item.getItemID()));
+ result.append(':');
+ result.append(String.valueOf(item.getValue()));
+ }
+ result.append(']');
+ return result.toString();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
new file mode 100644
index 0000000..e3fab29
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import com.google.common.primitives.Longs;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+import java.util.regex.Pattern;
+
+/**
+ * Some helper methods for the hadoop-related stuff in org.apache.mahout.cf.taste
+ */
+public final class TasteHadoopUtils {
+
+ /** Token position of the user ID in a split preference line. */
+ public static final int USER_ID_POS = 0;
+ /** Token position of the item ID in a split preference line. */
+ public static final int ITEM_ID_POS = 1;
+
+ /** Standard delimiter of textual preference data */
+ private static final Pattern PREFERENCE_TOKEN_DELIMITER = Pattern.compile("[\t,]");
+
+ // utility class; not instantiable
+ private TasteHadoopUtils() {}
+
+ /**
+ * Splits a preference data line into string tokens
+ */
+ public static String[] splitPrefTokens(CharSequence line) {
+ return PREFERENCE_TOKEN_DELIMITER.split(line);
+ }
+
+ /**
+ * Maps a long to an int with range of 0 to Integer.MAX_VALUE-1
+ *
+ * <p>NOTE(review): {@code %} binds tighter than {@code &}, so this masks the
+ * (possibly negative) remainder. A remainder of exactly -1 masks to
+ * Integer.MAX_VALUE itself, making the documented upper bound off by one in
+ * that rare case — confirm whether callers depend on the stated bound.
+ * Changing the formula would invalidate previously persisted index files.</p>
+ */
+ public static int idToIndex(long id) {
+ return 0x7FFFFFFF & Longs.hashCode(id) % 0x7FFFFFFE;
+ }
+
+ /**
+ * Parses a token as an ID: hashed down to an index when long IDs are in use,
+ * otherwise parsed directly as an int.
+ */
+ public static int readID(String token, boolean usesLongIDs) {
+ return usesLongIDs ? idToIndex(Long.parseLong(token)) : Integer.parseInt(token);
+ }
+
+ /**
+ * Reads a binary mapping file
+ *
+ * @param idIndexPathStr directory of sequence files holding (index, ID) pairs
+ * @param conf Hadoop configuration used to open the files
+ * @return map from int index to original long ID
+ */
+ public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) {
+ OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap();
+ Path itemIDIndexPath = new Path(idIndexPathStr);
+ for (Pair<VarIntWritable,VarLongWritable> record
+ : new SequenceFileDirIterable<VarIntWritable,VarLongWritable>(itemIDIndexPath,
+ PathType.LIST,
+ PathFilters.partFilter(),
+ null,
+ true,
+ conf)) {
+ indexIDMap.put(record.getFirst().get(), record.getSecond().get());
+ }
+ return indexIDMap;
+ }
+
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
new file mode 100644
index 0000000..fdb552e
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
+import org.apache.mahout.math.VarLongWritable;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+public abstract class ToEntityPrefsMapper extends
+ Mapper<LongWritable,Text, VarLongWritable,VarLongWritable> {
+
+ // NOTE(review): these keys concatenate Class.toString() (i.e. "class org.apache....ToEntityPrefsMapper")
+ // with the suffix and no separator. Odd-looking, but changing them would break existing job configs.
+ public static final String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem";
+ public static final String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings";
+
+ /** Fields are separated by tab or comma. */
+ private static final Pattern DELIMITER = Pattern.compile("[\t,]");
+
+ private boolean booleanData; // true: preferences have no strength, only presence
+ private boolean transpose; // true: swap user and item IDs on every line
+ private final boolean itemKey; // true: subclass wants items (not users) as keys
+ private float ratingShift; // constant added to every parsed preference value
+
+ ToEntityPrefsMapper(boolean itemKey) {
+ this.itemKey = itemKey;
+ }
+
+ /** Reads job configuration into the per-mapper flags above. */
+ @Override
+ protected void setup(Context context) {
+ Configuration jobConf = context.getConfiguration();
+ booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false);
+ transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false);
+ ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0"));
+ }
+
+ /**
+ * Parses one "userID,itemID[,pref]" line and emits (userID, itemID) for boolean
+ * data, or (userID, EntityPrefWritable(itemID, pref + ratingShift)) otherwise.
+ * A missing preference defaults to 1.0 (unshifted).
+ */
+ @Override
+ public void map(LongWritable key,
+ Text value,
+ Context context) throws IOException, InterruptedException {
+ String[] tokens = DELIMITER.split(value.toString());
+ long userID = Long.parseLong(tokens[0]);
+ long itemID = Long.parseLong(tokens[1]);
+ if (itemKey ^ transpose) {
+ // If using items as keys, and not transposing items and users, then users are items!
+ // Or if not using items as keys (users are, as usual), but transposing items and users,
+ // then users are items! Confused?
+ long temp = userID;
+ userID = itemID;
+ itemID = temp;
+ }
+ if (booleanData) {
+ context.write(new VarLongWritable(userID), new VarLongWritable(itemID));
+ } else {
+ float prefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f;
+ context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue));
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
new file mode 100644
index 0000000..f5f9574
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+/**
+ * <h1>Input</h1>
+ *
+ * <p>
+ * Intended for use with {@link org.apache.hadoop.mapreduce.lib.input.TextInputFormat};
+ * accepts line number / line pairs as
+ * {@link org.apache.hadoop.io.LongWritable}/{@link org.apache.hadoop.io.Text} pairs.
+ * </p>
+ *
+ * <p>
+ * Each line is assumed to be of the form {@code userID,itemID,preference}, or {@code userID,itemID}.
+ * </p>
+ *
+ * <h1>Output</h1>
+ *
+ * <p>
+ * Outputs the user ID as a {@link org.apache.mahout.math.VarLongWritable} mapped to the item ID and preference as a
+ * {@link EntityPrefWritable}.
+ * </p>
+ */
+public final class ToItemPrefsMapper extends ToEntityPrefsMapper {
+
+ public ToItemPrefsMapper() {
+ // false: keys the output by user ID (the default orientation); see ToEntityPrefsMapper
+ super(false);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
new file mode 100644
index 0000000..8f563b0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+public class TopItemsQueue extends PriorityQueue<MutableRecommendedItem> {
+
+ private static final long SENTINEL_ID = Long.MIN_VALUE;
+
+ private final int maxSize;
+
+ public TopItemsQueue(int maxSize) {
+ super(maxSize);
+ this.maxSize = maxSize;
+ }
+
+ public List<RecommendedItem> getTopItems() {
+ List<RecommendedItem> recommendedItems = new ArrayList<>(maxSize);
+ while (size() > 0) {
+ MutableRecommendedItem topItem = pop();
+ // filter out "sentinel" objects necessary for maintaining an efficient priority queue
+ if (topItem.getItemID() != SENTINEL_ID) {
+ recommendedItems.add(topItem);
+ }
+ }
+ Collections.reverse(recommendedItems);
+ return recommendedItems;
+ }
+
+ @Override
+ protected boolean lessThan(MutableRecommendedItem one, MutableRecommendedItem two) {
+ return one.getValue() < two.getValue();
+ }
+
+ @Override
+ protected MutableRecommendedItem getSentinelObject() {
+ return new MutableRecommendedItem(SENTINEL_ID, Float.MIN_VALUE);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
new file mode 100644
index 0000000..4bb95ae
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+final class ALS {
+
+ // utility class; not instantiable
+ private ALS() {}
+
+ /**
+ * Reads the first vector found in the part files under {@code dir}, or returns
+ * {@code null} when no vector is present.
+ */
+ static Vector readFirstRow(Path dir, Configuration conf) throws IOException {
+ Iterator<VectorWritable> iterator = new SequenceFileDirValueIterator<>(dir, PathType.LIST,
+ PathFilters.partFilter(), null, true, conf);
+ return iterator.hasNext() ? iterator.next().get() : null;
+ }
+
+ /**
+ * Loads a feature matrix, keyed by row index, from the sequence files in the
+ * job's distributed cache (read via the local file system).
+ *
+ * @param numEntities expected row count, used to presize the map when positive
+ * @throws IllegalStateException if the cache yields no rows at all
+ */
+ public static OpenIntObjectHashMap<Vector> readMatrixByRowsFromDistributedCache(int numEntities,
+ Configuration conf) throws IOException {
+
+ IntWritable rowIndex = new IntWritable();
+ VectorWritable row = new VectorWritable();
+
+
+ OpenIntObjectHashMap<Vector> featureMatrix = numEntities > 0
+ ? new OpenIntObjectHashMap<Vector>(numEntities) : new OpenIntObjectHashMap<Vector>();
+
+ // NOTE(review): assumes getCachedFiles never returns null here — verify against HadoopUtil
+ Path[] cachedFiles = HadoopUtil.getCachedFiles(conf);
+ LocalFileSystem localFs = FileSystem.getLocal(conf);
+
+ for (Path cachedFile : cachedFiles) {
+ try (SequenceFile.Reader reader = new SequenceFile.Reader(localFs.getConf(), SequenceFile.Reader.file(cachedFile))) {
+ while (reader.next(rowIndex, row)) {
+ featureMatrix.put(rowIndex.get(), row.get());
+ }
+ }
+ }
+
+ Preconditions.checkState(!featureMatrix.isEmpty(), "Feature matrix is empty");
+ return featureMatrix;
+ }
+
+ /**
+ * Loads a feature matrix, keyed by row index, from the part files under {@code dir}.
+ */
+ public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) {
+ OpenIntObjectHashMap<Vector> matrix = new OpenIntObjectHashMap<>();
+ for (Pair<IntWritable,VectorWritable> pair
+ : new SequenceFileDirIterable<IntWritable,VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf)) {
+ int rowIndex = pair.getFirst().get();
+ Vector row = pair.getSecond().get();
+ matrix.put(rowIndex, row);
+ }
+ return matrix;
+ }
+
+ /**
+ * Solves for one user's (or item's) feature vector given its ratings and the
+ * opposite-side feature matrix {@code uOrM}: collects the feature vectors of
+ * the rated entities and delegates to {@link AlternatingLeastSquaresSolver}.
+ */
+ public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM,
+ double lambda, int numFeatures) {
+ Vector ratings = ratingsWritable.get();
+
+ List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements());
+ for (Vector.Element e : ratings.nonZeroes()) {
+ int index = e.index();
+ featureVectors.add(uOrM.get(index));
+ }
+
+ return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
new file mode 100644
index 0000000..b061a63
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.RandomUtils;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * <p>Split a recommendation dataset into a training and a test set</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the dataset</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--trainingPercentage (double): percentage of the data to use as training set (optional, default 0.9)</li>
+ * <li>--probePercentage (double): percentage of the data to use as probe set (optional, default 0.1)</li>
+ * </ol>
+ */
+public class DatasetSplitter extends AbstractJob {
+
+ // Configuration keys used to pass the split parameters to the mapper jobs below.
+ private static final String TRAINING_PERCENTAGE = DatasetSplitter.class.getName() + ".trainingPercentage";
+ private static final String PROBE_PERCENTAGE = DatasetSplitter.class.getName() + ".probePercentage";
+ private static final String PART_TO_USE = DatasetSplitter.class.getName() + ".partToUse";
+
+ // Marker keys written by MarkPreferencesMapper: "T" = training set, "P" = probe set.
+ private static final Text INTO_TRAINING_SET = new Text("T");
+ private static final Text INTO_PROBE_SET = new Text("P");
+
+ private static final double DEFAULT_TRAINING_PERCENTAGE = 0.9;
+ private static final double DEFAULT_PROBE_PERCENTAGE = 0.1;
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new DatasetSplitter(), args);
+ }
+
+ /**
+ * Runs three chained MR jobs: (1) tag every input line with a training/probe
+ * marker, (2) extract the training lines, (3) extract the probe lines.
+ *
+ * @return 0 on success, -1 if argument parsing or any of the jobs fails
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption("trainingPercentage", "t", "percentage of the data to use as training set (default: "
+ + DEFAULT_TRAINING_PERCENTAGE + ')', String.valueOf(DEFAULT_TRAINING_PERCENTAGE));
+ addOption("probePercentage", "p", "percentage of the data to use as probe set (default: "
+ + DEFAULT_PROBE_PERCENTAGE + ')', String.valueOf(DEFAULT_PROBE_PERCENTAGE));
+
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ double trainingPercentage = Double.parseDouble(getOption("trainingPercentage"));
+ double probePercentage = Double.parseDouble(getOption("probePercentage"));
+ String tempDir = getOption("tempDir");
+
+ Path markedPrefs = new Path(tempDir, "markedPreferences");
+ Path trainingSetPath = new Path(getOutputPath(), "trainingSet");
+ Path probeSetPath = new Path(getOutputPath(), "probeSet");
+
+ // Job 1: randomly mark each preference line as training ("T") or probe ("P").
+ Job markPreferences = prepareJob(getInputPath(), markedPrefs, TextInputFormat.class, MarkPreferencesMapper.class,
+ Text.class, Text.class, SequenceFileOutputFormat.class);
+ markPreferences.getConfiguration().set(TRAINING_PERCENTAGE, String.valueOf(trainingPercentage));
+ markPreferences.getConfiguration().set(PROBE_PERCENTAGE, String.valueOf(probePercentage));
+ boolean succeeded = markPreferences.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ // Job 2: keep only the lines marked "T".
+ Job createTrainingSet = prepareJob(markedPrefs, trainingSetPath, SequenceFileInputFormat.class,
+ WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
+ createTrainingSet.getConfiguration().set(PART_TO_USE, INTO_TRAINING_SET.toString());
+ succeeded = createTrainingSet.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ // Job 3: keep only the lines marked "P".
+ Job createProbeSet = prepareJob(markedPrefs, probeSetPath, SequenceFileInputFormat.class,
+ WritePrefsMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
+ createProbeSet.getConfiguration().set(PART_TO_USE, INTO_PROBE_SET.toString());
+ succeeded = createProbeSet.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+ /**
+ * Tags each input line with "T" (with probability trainingPercentage) or "P"
+ * (with probability probePercentage). Lines falling above both bounds are
+ * dropped entirely, so the two percentages need not sum to 1.
+ */
+ static class MarkPreferencesMapper extends Mapper<LongWritable,Text,Text,Text> {
+
+ private Random random;
+ private double trainingBound;
+ private double probeBound;
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ random = RandomUtils.getRandom();
+ trainingBound = Double.parseDouble(ctx.getConfiguration().get(TRAINING_PERCENTAGE));
+ // the probe interval is stacked directly on top of the training interval
+ probeBound = trainingBound + Double.parseDouble(ctx.getConfiguration().get(PROBE_PERCENTAGE));
+ }
+
+ @Override
+ protected void map(LongWritable key, Text text, Context ctx) throws IOException, InterruptedException {
+ double randomValue = random.nextDouble();
+ if (randomValue <= trainingBound) {
+ ctx.write(INTO_TRAINING_SET, text);
+ } else if (randomValue <= probeBound) {
+ ctx.write(INTO_PROBE_SET, text);
+ }
+ }
+ }
+
+ /** Emits only the lines whose marker key matches the configured part ("T" or "P"). */
+ static class WritePrefsMapper extends Mapper<Text,Text,NullWritable,Text> {
+
+ private String partToUse;
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ partToUse = ctx.getConfiguration().get(PART_TO_USE);
+ }
+
+ @Override
+ protected void map(Text key, Text text, Context ctx) throws IOException, InterruptedException {
+ if (partToUse.equals(key.toString())) {
+ ctx.write(NullWritable.get(), text);
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
new file mode 100644
index 0000000..4e6aaf5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+/**
+ * <p>Measures the root-mean-squared error of a rating matrix factorization against a test set.</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--output (path): path where output should go</li>
+ * <li>--pairs (path): path containing the test ratings, each line must be userID,itemID,rating</li>
+ * <li>--userFeatures (path): path to the user feature matrix</li>
+ * <li>--itemFeatures (path): path to the item feature matrix</li>
+ * </ol>
+ */
+public class FactorizationEvaluator extends AbstractJob {
+
+ // NOTE(review): these keys are namespaced under RecommenderJob, not this class —
+ // presumably shared with RecommenderJob's configuration; confirm this is intentional.
+ private static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures";
+ private static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures";
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new FactorizationEvaluator(), args);
+ }
+
+ /**
+ * Predicts a rating for every test triple via a map-only job, writes the
+ * per-rating errors to a temp path, then aggregates them into a single RMSE
+ * value stored as text in {@code <output>/rmse.txt}.
+ *
+ * @return 0 on success, -1 if argument parsing or the prediction job fails
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOption("userFeatures", null, "path to the user feature matrix", true);
+ addOption("itemFeatures", null, "path to the item feature matrix", true);
+ addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
+ addOutputOption();
+
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ Path errors = getTempPath("errors");
+
+ // map-only job: one (error, NullWritable) record per predictable test rating
+ Job predictRatings = prepareJob(getInputPath(), errors, TextInputFormat.class, PredictRatingsMapper.class,
+ DoubleWritable.class, NullWritable.class, SequenceFileOutputFormat.class);
+
+ Configuration conf = predictRatings.getConfiguration();
+ conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
+ conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
+
+ // Boolean.parseBoolean(null) is false, so the option may be omitted entirely
+ boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
+ if (usesLongIDs) {
+ conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
+ }
+
+
+ boolean succeeded = predictRatings.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ // write the aggregated RMSE as a single text value to <output>/rmse.txt
+ FileSystem fs = FileSystem.get(getOutputPath().toUri(), getConf());
+ FSDataOutputStream outputStream = fs.create(getOutputPath("rmse.txt"));
+ try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charsets.UTF_8))){
+ double rmse = computeRmse(errors);
+ writer.write(String.valueOf(rmse));
+ }
+ return 0;
+ }
+
+ /** Computes sqrt(mean of squared errors) over all error records written by the job above. */
+ private double computeRmse(Path errors) {
+ RunningAverage average = new FullRunningAverage();
+ for (Pair<DoubleWritable,NullWritable> entry
+ : new SequenceFileDirIterable<DoubleWritable, NullWritable>(errors, PathType.LIST, PathFilters.logsCRCFilter(),
+ getConf())) {
+ DoubleWritable error = entry.getFirst();
+ average.addDatum(error.get() * error.get());
+ }
+
+ return Math.sqrt(average.getAverage());
+ }
+
+ /**
+ * Loads both feature matrices fully into memory, then for each test rating
+ * emits (actual - estimated) where the estimate is the dot product of the
+ * user and item feature vectors. Ratings whose user or item has no feature
+ * vector are silently skipped.
+ */
+ public static class PredictRatingsMapper extends Mapper<LongWritable,Text,DoubleWritable,NullWritable> {
+
+ private OpenIntObjectHashMap<Vector> U;
+ private OpenIntObjectHashMap<Vector> M;
+
+ private boolean usesLongIDs;
+
+ // reused across map() calls to avoid per-record allocation
+ private final DoubleWritable error = new DoubleWritable();
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ Configuration conf = ctx.getConfiguration();
+
+ Path pathToU = new Path(conf.get(USER_FEATURES_PATH));
+ Path pathToM = new Path(conf.get(ITEM_FEATURES_PATH));
+
+ // both matrices are held entirely in memory for the lifetime of the mapper
+ U = ALS.readMatrixByRows(pathToU, conf);
+ M = ALS.readMatrixByRows(pathToM, conf);
+
+ usesLongIDs = conf.getBoolean(ParallelALSFactorizationJob.USES_LONG_IDS, false);
+ }
+
+ @Override
+ protected void map(LongWritable key, Text value, Context ctx) throws IOException, InterruptedException {
+
+ String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
+
+ int userID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.USER_ID_POS], usesLongIDs);
+ int itemID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.ITEM_ID_POS], usesLongIDs);
+ double rating = Double.parseDouble(tokens[2]);
+
+ if (U.containsKey(userID) && M.containsKey(itemID)) {
+ double estimate = U.get(userID).dot(M.get(itemID));
+ error.set(rating - estimate);
+ ctx.write(error, NullWritable.get());
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
new file mode 100644
index 0000000..d93e3a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/MultithreadedSharingMapper.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import java.io.IOException;
+
+/**
+ * Multithreaded Mapper for {@link SharingMapper}s. Will call setupSharedInstance() once in the controlling thread
+ * before executing the mappers using a thread pool.
+ *
+ * @param <K1>
+ * @param <V1>
+ * @param <K2>
+ * @param <V2>
+ */
+public class MultithreadedSharingMapper<K1, V1, K2, V2> extends MultithreadedMapper<K1, V1, K2, V2> {
+
+ /**
+ * Resolves the configured mapper class, verifies it is a {@link SharingMapper},
+ * performs the single-threaded shared setup, then delegates to the
+ * multithreaded {@code super.run(ctx)}.
+ *
+ * @throws NullPointerException if no mapper class is configured or it is not a SharingMapper
+ */
+ @Override
+ public void run(Context ctx) throws IOException, InterruptedException {
+ Class<Mapper<K1, V1, K2, V2>> mapperClass =
+ MultithreadedSharingMapper.getMapperClass((JobContext) ctx);
+ Preconditions.checkNotNull(mapperClass, "Could not find Multithreaded Mapper class.");
+
+ Configuration conf = ctx.getConfiguration();
+ // instantiate the mapper; this instance is used only for the shared setup below —
+ // presumably super.run() creates its own per-thread instances (confirm in MultithreadedMapper)
+ Mapper<K1, V1, K2, V2> mapper1 = ReflectionUtils.newInstance(mapperClass, conf);
+ SharingMapper<K1, V1, K2, V2, ?> mapper = null;
+ if (mapper1 instanceof SharingMapper) {
+ mapper = (SharingMapper<K1, V1, K2, V2, ?>) mapper1;
+ }
+ Preconditions.checkNotNull(mapper, "Could not instantiate SharingMapper. Class was: %s",
+ mapper1.getClass().getName());
+
+ // single threaded call to setup the sharing mapper
+ mapper.setupSharedInstance(ctx);
+
+ // multithreaded execution
+ super.run(ctx);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
new file mode 100644
index 0000000..2ce9b61
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
@@ -0,0 +1,414 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
+import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
+import org.apache.mahout.common.mapreduce.TransposeMapper;
+import org.apache.mahout.common.mapreduce.VectorSumCombiner;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VarIntWritable;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.Vectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>MapReduce implementation of the two factorization algorithms described in
+ *
+ * <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize" available at
+ * http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf.</p>
+ *
+ * "<p>Collaborative Filtering for Implicit Feedback Datasets" available at
+ * http://research.yahoo.com/pub/2433</p>
+ *
+ * </p>
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the dataset</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--lambda (double): regularization parameter to avoid overfitting</li>
+ * <li>--userFeatures (path): path to the user feature matrix</li>
+ * <li>--itemFeatures (path): path to the item feature matrix</li>
+ * <li>--numThreadsPerSolver (int): threads to use per solver mapper, (default: 1)</li>
+ * </ol>
+ */
+public class ParallelALSFactorizationJob extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(ParallelALSFactorizationJob.class);
+
+ // Configuration keys read by the solver and ID-mapping mappers.
+ static final String NUM_FEATURES = ParallelALSFactorizationJob.class.getName() + ".numFeatures";
+ static final String LAMBDA = ParallelALSFactorizationJob.class.getName() + ".lambda";
+ static final String ALPHA = ParallelALSFactorizationJob.class.getName() + ".alpha";
+ static final String NUM_ENTITIES = ParallelALSFactorizationJob.class.getName() + ".numEntities";
+
+ static final String USES_LONG_IDS = ParallelALSFactorizationJob.class.getName() + ".usesLongIDs";
+ static final String TOKEN_POS = ParallelALSFactorizationJob.class.getName() + ".tokenPos";
+
+ // Parsed command-line parameters; populated in run() before any job is launched.
+ private boolean implicitFeedback;
+ private int numIterations;
+ private int numFeatures;
+ private double lambda;
+ private double alpha;
+ private int numThreadsPerSolver;
+
+ // Counter incremented once per user by MergeUserVectorsReducer.
+ enum Stats { NUM_USERS }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new ParallelALSFactorizationJob(), args);
+ }
+
+ /**
+ * Drives the full factorization: builds A and A' from the text input,
+ * initializes M from per-item average ratings, then alternates between
+ * recomputing U and M for the configured number of iterations.
+ *
+ * @return 0 on success, -1 if argument parsing or a preparation job fails
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption("lambda", null, "regularization parameter", true);
+ addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
+ addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
+ addOption("numFeatures", null, "dimension of the feature space", true);
+ addOption("numIterations", null, "number of iterations", true);
+ addOption("numThreadsPerSolver", null, "threads per solver mapper", String.valueOf(1));
+ addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
+
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ numFeatures = Integer.parseInt(getOption("numFeatures"));
+ numIterations = Integer.parseInt(getOption("numIterations"));
+ lambda = Double.parseDouble(getOption("lambda"));
+ alpha = Double.parseDouble(getOption("alpha"));
+ implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
+
+ numThreadsPerSolver = Integer.parseInt(getOption("numThreadsPerSolver"));
+ boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs", String.valueOf(false)));
+
+ /*
+ * compute the factorization A = U M'
+ *
+ * where A (users x items) is the matrix of known ratings
+ * U (users x features) is the representation of users in the feature space
+ * M (items x features) is the representation of items in the feature space
+ */
+
+ // optionally build int-index -> long-ID translation tables for users and items
+ if (usesLongIDs) {
+ // NOTE(review): unlike the jobs below, the completion status of these two
+ // jobs is not checked — confirm failures here are acceptable to ignore.
+ Job mapUsers = prepareJob(getInputPath(), getOutputPath("userIDIndex"), TextInputFormat.class,
+ MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
+ VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
+ mapUsers.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.USER_ID_POS));
+ mapUsers.waitForCompletion(true);
+
+ Job mapItems = prepareJob(getInputPath(), getOutputPath("itemIDIndex"), TextInputFormat.class,
+ MapLongIDsMapper.class, VarIntWritable.class, VarLongWritable.class, IDMapReducer.class,
+ VarIntWritable.class, VarLongWritable.class, SequenceFileOutputFormat.class);
+ mapItems.getConfiguration().set(TOKEN_POS, String.valueOf(TasteHadoopUtils.ITEM_ID_POS));
+ mapItems.waitForCompletion(true);
+ }
+
+ /* create A' */
+ Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(),
+ TextInputFormat.class, ItemRatingVectorsMapper.class, IntWritable.class,
+ VectorWritable.class, VectorSumReducer.class, IntWritable.class,
+ VectorWritable.class, SequenceFileOutputFormat.class);
+ itemRatings.setCombinerClass(VectorSumCombiner.class);
+ itemRatings.getConfiguration().set(USES_LONG_IDS, String.valueOf(usesLongIDs));
+ boolean succeeded = itemRatings.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ /* create A */
+ Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(),
+ TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeUserVectorsReducer.class,
+ IntWritable.class, VectorWritable.class);
+ userRatings.setCombinerClass(MergeVectorsCombiner.class);
+ succeeded = userRatings.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ //TODO this could be fiddled into one of the upper jobs
+ Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
+ AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
+ IntWritable.class, VectorWritable.class);
+ averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
+ succeeded = averageItemRatings.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ Vector averageRatings = ALS.readFirstRow(getTempPath("averageRatings"), getConf());
+
+ int numItems = averageRatings.getNumNondefaultElements();
+ int numUsers = (int) userRatings.getCounters().findCounter(Stats.NUM_USERS).getValue();
+
+ log.info("Found {} users and {} items", numUsers, numItems);
+
+ /* create an initial M */
+ initializeM(averageRatings);
+
+ // alternate between solving for U (given M) and M (given U); iteration is 0-based
+ for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
+ /* broadcast M, read A row-wise, recompute U row-wise */
+ log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
+ runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1), currentIteration, "U",
+ numItems);
+ /* broadcast U, read A' row-wise, recompute M row-wise */
+ log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
+ runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration), currentIteration, "M",
+ numUsers);
+ }
+
+ return 0;
+ }
+
+ /**
+ * Writes the initial item-feature matrix M (as iteration -1): feature 0 holds the
+ * item's average rating, the remaining features are uniform random values.
+ */
+ private void initializeM(Vector averageRatings) throws IOException {
+ Random random = RandomUtils.getRandom();
+
+ FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
+ try (SequenceFile.Writer writer =
+ new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"),
+ IntWritable.class, VectorWritable.class)) {
+ IntWritable index = new IntWritable();
+ VectorWritable featureVector = new VectorWritable();
+
+ for (Vector.Element e : averageRatings.nonZeroes()) {
+ Vector row = new DenseVector(numFeatures);
+ row.setQuick(0, e.get());
+ for (int m = 1; m < numFeatures; m++) {
+ row.setQuick(m, random.nextDouble());
+ }
+ index.set(e.index());
+ featureVector.set(row);
+ writer.append(index, featureVector);
+ }
+ }
+ }
+
+ /** Sums all vectors per key and re-emits the sum as a sequential-access sparse vector. */
+ static class VectorSumReducer
+ extends Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>, VectorWritable> {
+
+ private final VectorWritable result = new VectorWritable();
+
+ @Override
+ protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> values, Context ctx)
+ throws IOException, InterruptedException {
+ Vector sum = Vectors.sum(values.iterator());
+ result.set(new SequentialAccessSparseVector(sum));
+ ctx.write(key, result);
+ }
+ }
+
+ /** Merges the partial user vectors per key and counts each user once via Stats.NUM_USERS. */
+ static class MergeUserVectorsReducer extends
+ Reducer<WritableComparable<?>,VectorWritable,WritableComparable<?>,VectorWritable> {
+
+ private final VectorWritable result = new VectorWritable();
+
+ @Override
+ public void reduce(WritableComparable<?> key, Iterable<VectorWritable> vectors, Context ctx)
+ throws IOException, InterruptedException {
+ Vector merged = VectorWritable.merge(vectors.iterator()).get();
+ result.set(new SequentialAccessSparseVector(merged));
+ ctx.write(key, result);
+ ctx.getCounter(Stats.NUM_USERS).increment(1);
+ }
+ }
+
+ /**
+ * Parses a text preference line and emits (itemID, sparse vector with the single
+ * entry ratings[userID] = rating); the sums of these vectors form the rows of A'.
+ */
+ static class ItemRatingVectorsMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {
+
+ // writables and the ratings vector are reused across map() calls
+ private final IntWritable itemIDWritable = new IntWritable();
+ private final VectorWritable ratingsWritable = new VectorWritable(true);
+ private final Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+
+ private boolean usesLongIDs;
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ usesLongIDs = ctx.getConfiguration().getBoolean(USES_LONG_IDS, false);
+ }
+
+ @Override
+ protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
+ String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
+ int userID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.USER_ID_POS], usesLongIDs);
+ int itemID = TasteHadoopUtils.readID(tokens[TasteHadoopUtils.ITEM_ID_POS], usesLongIDs);
+ float rating = Float.parseFloat(tokens[2]);
+
+ ratings.setQuick(userID, rating);
+
+ itemIDWritable.set(itemID);
+ ratingsWritable.set(ratings);
+
+ ctx.write(itemIDWritable, ratingsWritable);
+
+ // prepare instance for reuse
+ ratings.setQuick(userID, 0.0d);
+ }
+ }
+
+ /**
+ * Launches one multithreaded solver job: broadcasts the fixed factor matrix
+ * (U or M) via the distributed cache and recomputes the other factor row-wise.
+ *
+ * @throws IllegalStateException if the solver job does not complete successfully
+ */
+ private void runSolver(Path ratings, Path output, Path pathToUorM, int currentIteration, String matrixName,
+ int numEntities) throws ClassNotFoundException, IOException, InterruptedException {
+
+ // necessary for local execution in the same JVM only
+ SharingMapper.reset();
+
+ Class<? extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable>> solverMapperClassInternal;
+ String name;
+
+ if (implicitFeedback) {
+ solverMapperClassInternal = SolveImplicitFeedbackMapper.class;
+ name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
+ + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, implicit feedback)";
+ } else {
+ solverMapperClassInternal = SolveExplicitFeedbackMapper.class;
+ name = "Recompute " + matrixName + ", iteration (" + currentIteration + '/' + numIterations + "), "
+ + '(' + numThreadsPerSolver + " threads, " + numFeatures + " features, explicit feedback)";
+ }
+
+ Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, MultithreadedSharingMapper.class,
+ IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, name);
+ Configuration solverConf = solverForUorI.getConfiguration();
+ solverConf.set(LAMBDA, String.valueOf(lambda));
+ solverConf.set(ALPHA, String.valueOf(alpha));
+ solverConf.setInt(NUM_FEATURES, numFeatures);
+ solverConf.set(NUM_ENTITIES, String.valueOf(numEntities));
+
+ // ship every part file of the fixed factor matrix to the mappers
+ FileSystem fs = FileSystem.get(pathToUorM.toUri(), solverConf);
+ FileStatus[] parts = fs.listStatus(pathToUorM, PathFilters.partFilter());
+ for (FileStatus part : parts) {
+ if (log.isDebugEnabled()) {
+ log.debug("Adding {} to distributed cache", part.getPath().toString());
+ }
+ DistributedCache.addCacheFile(part.getPath().toUri(), solverConf);
+ }
+
+ MultithreadedMapper.setMapperClass(solverForUorI, solverMapperClassInternal);
+ MultithreadedMapper.setNumberOfThreads(solverForUorI, numThreadsPerSolver);
+
+ boolean succeeded = solverForUorI.waitForCompletion(true);
+ if (!succeeded) {
+ throw new IllegalStateException("Job failed!");
+ }
+ }
+
+ /**
+ * Emits (0, sparse vector with entry[itemID] = average rating of that item);
+ * merging these yields a single row holding every item's average rating.
+ */
+ static class AverageRatingMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
+
+ private final IntWritable firstIndex = new IntWritable(0);
+ private final Vector featureVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
+ private final VectorWritable featureVectorWritable = new VectorWritable();
+
+ @Override
+ protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
+ RunningAverage avg = new FullRunningAverage();
+ for (Vector.Element e : v.get().nonZeroes()) {
+ avg.addDatum(e.get());
+ }
+
+ featureVector.setQuick(r.get(), avg.getAverage());
+ featureVectorWritable.set(featureVector);
+ ctx.write(firstIndex, featureVectorWritable);
+
+ // prepare instance for reuse
+ featureVector.setQuick(r.get(), 0.0d);
+ }
+ }
+
+ /** Emits (int index derived from the long ID, long ID) for the configured token position. */
+ static class MapLongIDsMapper extends Mapper<LongWritable,Text,VarIntWritable,VarLongWritable> {
+
+ private int tokenPos;
+ private final VarIntWritable index = new VarIntWritable();
+ private final VarLongWritable idWritable = new VarLongWritable();
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException {
+ tokenPos = ctx.getConfiguration().getInt(TOKEN_POS, -1);
+ Preconditions.checkState(tokenPos >= 0);
+ }
+
+ @Override
+ protected void map(LongWritable key, Text line, Context ctx) throws IOException, InterruptedException {
+ String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
+
+ long id = Long.parseLong(tokens[tokenPos]);
+
+ index.set(TasteHadoopUtils.idToIndex(id));
+ idWritable.set(id);
+ ctx.write(index, idWritable);
+ }
+ }
+
+ /** Deduplicates the (index, ID) pairs: keeps an arbitrary single ID per index. */
+ static class IDMapReducer extends Reducer<VarIntWritable,VarLongWritable,VarIntWritable,VarLongWritable> {
+ @Override
+ protected void reduce(VarIntWritable index, Iterable<VarLongWritable> ids, Context ctx)
+ throws IOException, InterruptedException {
+ ctx.write(index, ids.iterator().next());
+ }
+ }
+
+ // The final iteration's factors go to the job output path; intermediates go to temp.
+ private Path pathToM(int iteration) {
+ return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
+ }
+
+ private Path pathToU(int iteration) {
+ return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
+ }
+
+ private Path pathToItemRatings() {
+ return getTempPath("itemRatings");
+ }
+
+ private Path pathToUserRatings() {
+ return getOutputPath("userRatings");
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
new file mode 100644
index 0000000..6e7ea81
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/PredictionMapper.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.mahout.cf.taste.hadoop.MutableRecommendedItem;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.TopItemsQueue;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.function.IntObjectProcedure;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+import org.apache.mahout.math.set.OpenIntHashSet;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * a multithreaded mapper that loads the feature matrices U and M into memory. Afterwards it computes recommendations
+ * from these. Can be executed by a {@link MultithreadedSharingMapper}.
+ */
public class PredictionMapper extends SharingMapper<IntWritable,VectorWritable,LongWritable,RecommendedItemsWritable,
    Pair<OpenIntObjectHashMap<Vector>,OpenIntObjectHashMap<Vector>>> {

  // Job-configured limits, read once in setup().
  private int recommendationsPerUser;
  private float maxRating;

  // ID translation: only populated when the original input used long IDs that
  // were hashed down to int indexes.
  private boolean usesLongIDs;
  private OpenIntLongHashMap userIDIndex;
  private OpenIntLongHashMap itemIDIndex;

  // Reused output writables to avoid per-record allocation.
  private final LongWritable userIDWritable = new LongWritable();
  private final RecommendedItemsWritable recommendations = new RecommendedItemsWritable();

  /**
   * Loads the user (U) and item (M) feature matrices once; the pair is shared by
   * all threads of the {@link MultithreadedSharingMapper} in this JVM.
   */
  @Override
  Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> createSharedInstance(Context ctx) {
    Configuration conf = ctx.getConfiguration();
    Path pathToU = new Path(conf.get(RecommenderJob.USER_FEATURES_PATH));
    Path pathToM = new Path(conf.get(RecommenderJob.ITEM_FEATURES_PATH));

    OpenIntObjectHashMap<Vector> U = ALS.readMatrixByRows(pathToU, conf);
    OpenIntObjectHashMap<Vector> M = ALS.readMatrixByRows(pathToM, conf);

    return new Pair<>(U, M);
  }

  /** Reads per-job parameters and, if long IDs are in use, the index-to-ID maps. */
  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    Configuration conf = ctx.getConfiguration();
    recommendationsPerUser = conf.getInt(RecommenderJob.NUM_RECOMMENDATIONS,
        RecommenderJob.DEFAULT_NUM_RECOMMENDATIONS);
    maxRating = Float.parseFloat(conf.get(RecommenderJob.MAX_RATING));

    usesLongIDs = conf.getBoolean(ParallelALSFactorizationJob.USES_LONG_IDS, false);
    if (usesLongIDs) {
      userIDIndex = TasteHadoopUtils.readIDIndexMap(conf.get(RecommenderJob.USER_INDEX_PATH), conf);
      itemIDIndex = TasteHadoopUtils.readIDIndexMap(conf.get(RecommenderJob.ITEM_INDEX_PATH), conf);
    }
  }

  /**
   * Computes the top-N predicted ratings for one user: scores every item the user
   * has not yet rated via the dot product of user and item feature vectors, keeps
   * the best N in a bounded queue, caps them to maxRating and emits them.
   */
  @Override
  protected void map(IntWritable userIndexWritable, VectorWritable ratingsWritable, Context ctx)
    throws IOException, InterruptedException {

    Pair<OpenIntObjectHashMap<Vector>, OpenIntObjectHashMap<Vector>> uAndM = getSharedInstance();
    OpenIntObjectHashMap<Vector> U = uAndM.getFirst();
    OpenIntObjectHashMap<Vector> M = uAndM.getSecond();

    Vector ratings = ratingsWritable.get();
    int userIndex = userIndexWritable.get();
    // Items the user already rated are excluded from recommendation below.
    final OpenIntHashSet alreadyRatedItems = new OpenIntHashSet(ratings.getNumNondefaultElements());

    for (Vector.Element e : ratings.nonZeroes()) {
      alreadyRatedItems.add(e.index());
    }

    // Bounded min-queue holding the N best predictions seen so far.
    final TopItemsQueue topItemsQueue = new TopItemsQueue(recommendationsPerUser);
    final Vector userFeatures = U.get(userIndex);

    M.forEachPair(new IntObjectProcedure<Vector>() {
      @Override
      public boolean apply(int itemID, Vector itemFeatures) {
        if (!alreadyRatedItems.contains(itemID)) {
          // Predicted rating is the dot product of the two feature vectors.
          double predictedRating = userFeatures.dot(itemFeatures);

          // Only displace the current minimum if this prediction beats it.
          MutableRecommendedItem top = topItemsQueue.top();
          if (predictedRating > top.getValue()) {
            top.set(itemID, (float) predictedRating);
            topItemsQueue.updateTop();
          }
        }
        return true; // keep iterating over all items
      }
    });

    List<RecommendedItem> recommendedItems = topItemsQueue.getTopItems();

    if (!recommendedItems.isEmpty()) {

      // cap predictions to maxRating
      for (RecommendedItem topItem : recommendedItems) {
        ((MutableRecommendedItem) topItem).capToMaxValue(maxRating);
      }

      if (usesLongIDs) {
        // Translate the internal int indexes back to the original long IDs.
        long userID = userIDIndex.get(userIndex);
        userIDWritable.set(userID);

        for (RecommendedItem topItem : recommendedItems) {
          // remap item IDs
          long itemID = itemIDIndex.get((int) topItem.getItemID());
          ((MutableRecommendedItem) topItem).setItemID(itemID);
        }

      } else {
        userIDWritable.set(userIndex);
      }

      recommendations.set(recommendedItems);
      ctx.write(userIDWritable, recommendations);
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
new file mode 100644
index 0000000..679d227
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
+import org.apache.mahout.common.AbstractJob;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * <p>Computes the top-N recommendations per user from a decomposition of the rating matrix</p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing the vectorized user ratings</li>
+ * <li>--output (path): path where output should go</li>
+ * <li>--numRecommendations (int): maximum number of recommendations per user (default: 10)</li>
+ * <li>--maxRating (double): maximum rating of an item</li>
+ * <li>--numThreads (int): threads to use per mapper, (default: 1)</li>
+ * </ol>
+ */
+public class RecommenderJob extends AbstractJob {
+
+ static final String NUM_RECOMMENDATIONS = RecommenderJob.class.getName() + ".numRecommendations";
+ static final String USER_FEATURES_PATH = RecommenderJob.class.getName() + ".userFeatures";
+ static final String ITEM_FEATURES_PATH = RecommenderJob.class.getName() + ".itemFeatures";
+ static final String MAX_RATING = RecommenderJob.class.getName() + ".maxRating";
+ static final String USER_INDEX_PATH = RecommenderJob.class.getName() + ".userIndex";
+ static final String ITEM_INDEX_PATH = RecommenderJob.class.getName() + ".itemIndex";
+
+ static final int DEFAULT_NUM_RECOMMENDATIONS = 10;
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new RecommenderJob(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOption("userFeatures", null, "path to the user feature matrix", true);
+ addOption("itemFeatures", null, "path to the item feature matrix", true);
+ addOption("numRecommendations", null, "number of recommendations per user",
+ String.valueOf(DEFAULT_NUM_RECOMMENDATIONS));
+ addOption("maxRating", null, "maximum rating available", true);
+ addOption("numThreads", null, "threads per mapper", String.valueOf(1));
+ addOption("usesLongIDs", null, "input contains long IDs that need to be translated");
+ addOption("userIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
+ addOption("itemIDIndex", null, "index for user long IDs (necessary if usesLongIDs is true)");
+ addOutputOption();
+
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
+ MultithreadedSharingMapper.class, IntWritable.class, RecommendedItemsWritable.class, TextOutputFormat.class);
+ Configuration conf = prediction.getConfiguration();
+
+ int numThreads = Integer.parseInt(getOption("numThreads"));
+
+ conf.setInt(NUM_RECOMMENDATIONS, Integer.parseInt(getOption("numRecommendations")));
+ conf.set(USER_FEATURES_PATH, getOption("userFeatures"));
+ conf.set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
+ conf.set(MAX_RATING, getOption("maxRating"));
+
+ boolean usesLongIDs = Boolean.parseBoolean(getOption("usesLongIDs"));
+ if (usesLongIDs) {
+ conf.set(ParallelALSFactorizationJob.USES_LONG_IDS, String.valueOf(true));
+ conf.set(USER_INDEX_PATH, getOption("userIDIndex"));
+ conf.set(ITEM_INDEX_PATH, getOption("itemIDIndex"));
+ }
+
+ MultithreadedMapper.setMapperClass(prediction, PredictionMapper.class);
+ MultithreadedMapper.setNumberOfThreads(prediction, numThreads);
+
+ boolean succeeded = prediction.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
new file mode 100644
index 0000000..9925807
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SharingMapper.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+
+/**
+ * Mapper class to be used by {@link MultithreadedSharingMapper}. Offers a shared instance, created once
+ * per JVM via createSharedInstance(), that all mapper threads can read.
+ *
+ * Suitable for mappers that need large, read-only in-memory data to operate.
+ *
+ * @param <K1>
+ * @param <V1>
+ * @param <K2>
+ * @param <V2>
+ */
public abstract class SharingMapper<K1,V1,K2,V2,S> extends Mapper<K1,V1,K2,V2> {

  // One shared instance per JVM (static): all mapper threads in this task see the
  // same object. NOTE(review): access is not synchronized — presumably
  // setupSharedInstance() is invoked once before the worker threads start
  // (by MultithreadedSharingMapper); confirm against that caller.
  private static Object SHARED_INSTANCE;

  /**
   * Called before the multithreaded execution
   *
   * @param context mapper's context
   */
  abstract S createSharedInstance(Context context) throws IOException;

  /** Lazily creates the shared instance on first call; subsequent calls are no-ops. */
  final void setupSharedInstance(Context context) throws IOException {
    if (SHARED_INSTANCE == null) {
      SHARED_INSTANCE = createSharedInstance(context);
    }
  }

  /** @return the shared instance (unchecked cast from the static Object holder) */
  final S getSharedInstance() {
    return (S) SHARED_INSTANCE;
  }

  /** Clears the shared instance; intended for tests / reuse within one JVM. */
  static void reset() {
    SHARED_INSTANCE = null;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
new file mode 100644
index 0000000..2569918
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/SolveExplicitFeedbackMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.als;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+
+import org.apache.mahout.math.map.OpenIntObjectHashMap;
+
+import java.io.IOException;
+
+/** Solving mapper that can be safely executed using multiple threads */
+public class SolveExplicitFeedbackMapper
+ extends SharingMapper<IntWritable,VectorWritable,IntWritable,VectorWritable,OpenIntObjectHashMap<Vector>> {
+
+ private double lambda;
+ private int numFeatures;
+ private final VectorWritable uiOrmj = new VectorWritable();
+
+ @Override
+ OpenIntObjectHashMap<Vector> createSharedInstance(Context ctx) throws IOException {
+ Configuration conf = ctx.getConfiguration();
+ int numEntities = Integer.parseInt(conf.get(ParallelALSFactorizationJob.NUM_ENTITIES));
+ return ALS.readMatrixByRowsFromDistributedCache(numEntities, conf);
+ }
+
+ @Override
+ protected void setup(Mapper.Context ctx) throws IOException, InterruptedException {
+ lambda = Double.parseDouble(ctx.getConfiguration().get(ParallelALSFactorizationJob.LAMBDA));
+ numFeatures = ctx.getConfiguration().getInt(ParallelALSFactorizationJob.NUM_FEATURES, -1);
+ Preconditions.checkArgument(numFeatures > 0, "numFeatures must be greater then 0!");
+ }
+
+ @Override
+ protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
+ throws IOException, InterruptedException {
+ OpenIntObjectHashMap<Vector> uOrM = getSharedInstance();
+ uiOrmj.set(ALS.solveExplicit(ratingsWritable, uOrM, lambda, numFeatures));
+ ctx.write(userOrItemID, uiOrmj);
+ }
+
+}
r***@apache.org
2018-06-28 14:54:51 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
new file mode 100644
index 0000000..0f94c22
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
/**
 * A read-only view of a {@link RunningAverage} whose average is negated.
 * Mutating methods are unsupported; only the delegate may be updated.
 */
public final class InvertedRunningAverage implements RunningAverage {

  private final RunningAverage delegate;

  public InvertedRunningAverage(RunningAverage delegate) {
    this.delegate = delegate;
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void addDatum(double datum) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void removeDatum(double datum) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void changeDatum(double delta) {
    throw new UnsupportedOperationException();
  }

  @Override
  public int getCount() {
    return delegate.getCount();
  }

  /** @return the delegate's average, negated */
  @Override
  public double getAverage() {
    return -delegate.getAverage();
  }

  /** @return the underlying delegate (inverting an inversion yields the original) */
  @Override
  public RunningAverage inverse() {
    return delegate;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..147012d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
/**
 * A read-only view of a {@link RunningAverageAndStdDev} whose average is negated.
 * The standard deviation is unchanged by negation; mutating methods are unsupported.
 */
public final class InvertedRunningAverageAndStdDev implements RunningAverageAndStdDev {

  private final RunningAverageAndStdDev delegate;

  public InvertedRunningAverageAndStdDev(RunningAverageAndStdDev delegate) {
    this.delegate = delegate;
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void addDatum(double datum) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void removeDatum(double datum) {
    throw new UnsupportedOperationException();
  }

  /** @throws UnsupportedOperationException always — this view is read-only */
  @Override
  public void changeDatum(double delta) {
    throw new UnsupportedOperationException();
  }

  @Override
  public int getCount() {
    return delegate.getCount();
  }

  /** @return the delegate's average, negated */
  @Override
  public double getAverage() {
    return -delegate.getAverage();
  }

  /** @return the delegate's standard deviation (invariant under negation) */
  @Override
  public double getStandardDeviation() {
    return delegate.getStandardDeviation();
  }

  /** @return the underlying delegate (inverting an inversion yields the original) */
  @Override
  public RunningAverageAndStdDev inverse() {
    return delegate;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
new file mode 100644
index 0000000..5127df0
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveArrayIterator.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * While long[] is an Iterable, it is not an Iterable&lt;Long&gt;. This adapter class addresses that.
+ */
+public final class LongPrimitiveArrayIterator implements LongPrimitiveIterator {
+
+ private final long[] array;
+ private int position;
+ private final int max;
+
+ /**
+ * <p>
+ * Creates an {@link LongPrimitiveArrayIterator} over an entire array.
+ * </p>
+ *
+ * @param array
+ * array to iterate over
+ */
+ public LongPrimitiveArrayIterator(long[] array) {
+ this.array = Preconditions.checkNotNull(array); // yeah, not going to copy the array here, for performance
+ this.position = 0;
+ this.max = array.length;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return position < max;
+ }
+
+ @Override
+ public Long next() {
+ return nextLong();
+ }
+
+ @Override
+ public long nextLong() {
+ if (position >= array.length) {
+ throw new NoSuchElementException();
+ }
+ return array[position++];
+ }
+
+ @Override
+ public long peek() {
+ if (position >= array.length) {
+ throw new NoSuchElementException();
+ }
+ return array[position];
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void skip(int n) {
+ if (n > 0) {
+ position += n;
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "LongPrimitiveArrayIterator";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
new file mode 100644
index 0000000..0840749
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/LongPrimitiveIterator.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * Adds notion of iterating over {@code long} primitives in the style of an {@link java.util.Iterator} -- as
+ * opposed to iterating over {@link Long}. Implementations of this interface however also implement
+ * {@link java.util.Iterator} and {@link Iterable} over {@link Long} for convenience.
+ */
public interface LongPrimitiveIterator extends SkippingIterator<Long> {

  /**
   * Like {@link java.util.Iterator#next()}, but unboxed.
   *
   * @return next {@code long} in iteration
   * @throws java.util.NoSuchElementException
   *           if no more elements exist in the iteration
   */
  long nextLong();

  /**
   * @return next {@code long} in iteration without advancing iteration
   */
  long peek();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
new file mode 100644
index 0000000..3e03108
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RefreshHelper.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A helper class for implementing {@link Refreshable}. This object is typically included in an implementation
+ * {@link Refreshable} to implement {@link Refreshable#refresh(Collection)}. It executes the class's own
+ * supplied update logic, after updating all the object's dependencies. This also ensures that dependencies
+ * are not updated multiple times.
+ */
+public final class RefreshHelper implements Refreshable {
+
+ private static final Logger log = LoggerFactory.getLogger(RefreshHelper.class);
+
+ private final List<Refreshable> dependencies;
+ private final ReentrantLock refreshLock;
+ private final Callable<?> refreshRunnable;
+
+ /**
+ * @param refreshRunnable
+ * encapsulates the containing object's own refresh logic
+ */
+ public RefreshHelper(Callable<?> refreshRunnable) {
+ this.dependencies = new ArrayList<>(3);
+ this.refreshLock = new ReentrantLock();
+ this.refreshRunnable = refreshRunnable;
+ }
+
+ /** Add a dependency to be refreshed first when the encapsulating object does. */
+ public void addDependency(Refreshable refreshable) {
+ if (refreshable != null) {
+ dependencies.add(refreshable);
+ }
+ }
+
+ public void removeDependency(Refreshable refreshable) {
+ if (refreshable != null) {
+ dependencies.remove(refreshable);
+ }
+ }
+
+ /**
+ * Typically this is called in {@link Refreshable#refresh(java.util.Collection)} and is the entire body of
+ * that method.
+ */
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ if (refreshLock.tryLock()) {
+ try {
+ alreadyRefreshed = buildRefreshed(alreadyRefreshed);
+ for (Refreshable dependency : dependencies) {
+ maybeRefresh(alreadyRefreshed, dependency);
+ }
+ if (refreshRunnable != null) {
+ try {
+ refreshRunnable.call();
+ } catch (Exception e) {
+ log.warn("Unexpected exception while refreshing", e);
+ }
+ }
+ } finally {
+ refreshLock.unlock();
+ }
+ }
+ }
+
+ /**
+ * Creates a new and empty {@link Collection} if the method parameter is {@code null}.
+ *
+ * @param currentAlreadyRefreshed
+ * {@link Refreshable}s to refresh later on
+ * @return an empty {@link Collection} if the method param was {@code null} or the unmodified method
+ * param.
+ */
+ public static Collection<Refreshable> buildRefreshed(Collection<Refreshable> currentAlreadyRefreshed) {
+ return currentAlreadyRefreshed == null ? new HashSet<Refreshable>(3) : currentAlreadyRefreshed;
+ }
+
+ /**
+ * Adds the specified {@link Refreshable} to the given collection of {@link Refreshable}s if it is not
+ * already there and immediately refreshes it.
+ *
+ * @param alreadyRefreshed
+ * the collection of {@link Refreshable}s
+ * @param refreshable
+ * the {@link Refreshable} to potentially add and refresh
+ */
+ public static void maybeRefresh(Collection<Refreshable> alreadyRefreshed, Refreshable refreshable) {
+ if (!alreadyRefreshed.contains(refreshable)) {
+ alreadyRefreshed.add(refreshable);
+ log.info("Added refreshable: {}", refreshable);
+ refreshable.refresh(alreadyRefreshed);
+ log.info("Refreshed: {}", alreadyRefreshed);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
new file mode 100644
index 0000000..40da9de
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Retriever.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
/**
 * <p>
 * Implementations can retrieve a value for a given key.
 * </p>
 *
 * @param <K> type of the key used for lookups
 * @param <V> type of the value that is retrieved
 */
public interface Retriever<K,V> {

  /**
   * @param key key for which a value should be retrieved
   * @return value for key
   * @throws TasteException if an error occurs while retrieving the value
   */
  V get(K key) throws TasteException;

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
new file mode 100644
index 0000000..bf8e39c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverage.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
/**
 * <p>
 * Interface for classes that can keep track of a running average of a series of numbers. One can add to or
 * remove from the series, as well as update a datum in the series. The class does not actually keep track of
 * the series of values, just its running average, so it doesn't even matter if you remove/change a value that
 * wasn't added.
 * </p>
 */
public interface RunningAverage {

  /**
   * @param datum
   *          new item to add to the running average
   * @throws IllegalArgumentException
   *          if datum is {@link Double#NaN}
   */
  void addDatum(double datum);

  /**
   * @param datum
   *          item to remove from the running average
   * @throws IllegalArgumentException
   *          if datum is {@link Double#NaN}
   * @throws IllegalStateException
   *          if count is 0
   */
  void removeDatum(double datum);

  /**
   * @param delta
   *          amount by which to change a datum in the running average
   * @throws IllegalArgumentException
   *          if delta is {@link Double#NaN}
   * @throws IllegalStateException
   *          if count is 0
   */
  void changeDatum(double delta);

  /** @return the number of data in the series so far */
  int getCount();

  /** @return the current running average; implementation-defined (often NaN) when the count is 0 */
  double getAverage();

  /**
   * @return a (possibly immutable) object whose average is the negative of this object's
   */
  RunningAverage inverse();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
new file mode 100644
index 0000000..4ac6108
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/RunningAverageAndStdDev.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
/**
 * <p>
 * Extends {@link RunningAverage} by adding standard deviation too.
 * </p>
 */
public interface RunningAverageAndStdDev extends RunningAverage {

  /** @return standard deviation of the data seen so far */
  double getStandardDeviation();

  /**
   * @return a (possibly immutable) object whose average is the negative of this object's
   */
  @Override
  RunningAverageAndStdDev inverse();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
new file mode 100644
index 0000000..6da709d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SamplingLongPrimitiveIterator.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.math3.distribution.PascalDistribution;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.RandomWrapper;
+
/**
 * Wraps a {@link LongPrimitiveIterator} and returns only some subset of the elements that it would,
 * as determined by a sampling rate parameter.
 */
public final class SamplingLongPrimitiveIterator extends AbstractLongPrimitiveIterator {

  // Draws the gap to the next retained element; a geometric distribution is the special
  // case of the negative binomial (Pascal) distribution with r=1.
  private final PascalDistribution geometricDistribution;
  private final LongPrimitiveIterator delegate;
  private long next;       // buffered next element; only meaningful while hasNext is true
  private boolean hasNext;

  public SamplingLongPrimitiveIterator(LongPrimitiveIterator delegate, double samplingRate) {
    this(RandomUtils.getRandom(), delegate, samplingRate);
  }

  /**
   * @param random source of randomness for the sampling
   * @param delegate iterator to sample elements from; must not be {@code null}
   * @param samplingRate fraction of elements to retain; must satisfy 0.0 &lt; samplingRate &lt;= 1.0
   * @throws IllegalArgumentException if samplingRate is out of range
   * @throws NullPointerException if delegate is {@code null}
   */
  public SamplingLongPrimitiveIterator(RandomWrapper random, LongPrimitiveIterator delegate, double samplingRate) {
    Preconditions.checkNotNull(delegate);
    Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "Must be: 0.0 < samplingRate <= 1.0");
    // Geometric distribution is special case of negative binomial (aka Pascal) with r=1:
    geometricDistribution = new PascalDistribution(random.getRandomGenerator(), 1, samplingRate);
    this.delegate = delegate;
    this.hasNext = true;
    // Buffer the first sampled element up front (may immediately discover the delegate is empty).
    doNext();
  }

  @Override
  public boolean hasNext() {
    return hasNext;
  }

  /**
   * @return the next sampled element
   * @throws NoSuchElementException if no sampled elements remain
   */
  @Override
  public long nextLong() {
    if (hasNext) {
      long result = next;
      doNext();
      return result;
    }
    throw new NoSuchElementException();
  }

  /**
   * @return the next sampled element without consuming it
   * @throws NoSuchElementException if no sampled elements remain
   */
  @Override
  public long peek() {
    if (hasNext) {
      return next;
    }
    throw new NoSuchElementException();
  }

  // Advances the delegate past one sampled gap and buffers the element found there,
  // or marks this iterator exhausted when the delegate runs out.
  private void doNext() {
    int toSkip = geometricDistribution.sample();
    delegate.skip(toSkip);
    if (delegate.hasNext()) {
      next = delegate.next();
    } else {
      hasNext = false;
    }
  }

  /**
   * @throws UnsupportedOperationException
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  // Skips n sampled elements in one pass by summing n geometric gaps; the buffered
  // element is replaced, which draws from the same gap distribution as n nextLong() calls.
  @Override
  public void skip(int n) {
    int toSkip = 0;
    for (int i = 0; i < n; i++) {
      toSkip += geometricDistribution.sample();
    }
    delegate.skip(toSkip);
    if (delegate.hasNext()) {
      next = delegate.next();
    } else {
      hasNext = false;
    }
  }

  /**
   * @return the delegate itself when samplingRate &gt;= 1.0 (no sampling needed),
   *         otherwise a sampling wrapper around it
   */
  public static LongPrimitiveIterator maybeWrapIterator(LongPrimitiveIterator delegate, double samplingRate) {
    return samplingRate >= 1.0 ? delegate : new SamplingLongPrimitiveIterator(delegate, samplingRate);
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
new file mode 100644
index 0000000..e88f98a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/SkippingIterator.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.util.Iterator;
+
/**
 * Adds ability to skip ahead in an iterator, perhaps more efficiently than by calling {@link #next()}
 * repeatedly.
 */
public interface SkippingIterator<V> extends Iterator<V> {

  /**
   * Skip the next n elements supplied by this {@link Iterator}. If there are fewer than n elements remaining,
   * this skips all remaining elements in the {@link Iterator}. This method has the same effect as calling
   * {@link #next()} n times, except that it will never throw {@link java.util.NoSuchElementException}.
   *
   * @param n number of elements to skip
   */
  void skip(int n);

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
new file mode 100644
index 0000000..76e5239
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverage.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+import com.google.common.base.Preconditions;
+
+public class WeightedRunningAverage implements RunningAverage, Serializable {
+
+ private double totalWeight;
+ private double average;
+
+ public WeightedRunningAverage() {
+ totalWeight = 0.0;
+ average = Double.NaN;
+ }
+
+ @Override
+ public synchronized void addDatum(double datum) {
+ addDatum(datum, 1.0);
+ }
+
+ public synchronized void addDatum(double datum, double weight) {
+ double oldTotalWeight = totalWeight;
+ totalWeight += weight;
+ if (oldTotalWeight <= 0.0) {
+ average = datum;
+ } else {
+ average = average * oldTotalWeight / totalWeight + datum * weight / totalWeight;
+ }
+ }
+
+ @Override
+ public synchronized void removeDatum(double datum) {
+ removeDatum(datum, 1.0);
+ }
+
+ public synchronized void removeDatum(double datum, double weight) {
+ double oldTotalWeight = totalWeight;
+ totalWeight -= weight;
+ if (totalWeight <= 0.0) {
+ average = Double.NaN;
+ totalWeight = 0.0;
+ } else {
+ average = average * oldTotalWeight / totalWeight - datum * weight / totalWeight;
+ }
+ }
+
+ @Override
+ public synchronized void changeDatum(double delta) {
+ changeDatum(delta, 1.0);
+ }
+
+ public synchronized void changeDatum(double delta, double weight) {
+ Preconditions.checkArgument(weight <= totalWeight, "weight must be <= totalWeight");
+ average += delta * weight / totalWeight;
+ }
+
+ public synchronized double getTotalWeight() {
+ return totalWeight;
+ }
+
+ /** @return {@link #getTotalWeight()} */
+ @Override
+ public synchronized int getCount() {
+ return (int) totalWeight;
+ }
+
+ @Override
+ public synchronized double getAverage() {
+ return average;
+ }
+
+ @Override
+ public RunningAverage inverse() {
+ return new InvertedRunningAverage(this);
+ }
+
+ @Override
+ public synchronized String toString() {
+ return String.valueOf(average);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..bed5812
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/WeightedRunningAverageAndStdDev.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * This subclass also provides for a weighted estimate of the sample standard deviation.
+ * See <a href="http://en.wikipedia.org/wiki/Mean_square_weighted_deviation">estimate formulae here</a>.
+ */
+public final class WeightedRunningAverageAndStdDev extends WeightedRunningAverage implements RunningAverageAndStdDev {
+
+ private double totalSquaredWeight;
+ private double totalWeightedData;
+ private double totalWeightedSquaredData;
+
+ public WeightedRunningAverageAndStdDev() {
+ totalSquaredWeight = 0.0;
+ totalWeightedData = 0.0;
+ totalWeightedSquaredData = 0.0;
+ }
+
+ @Override
+ public synchronized void addDatum(double datum, double weight) {
+ super.addDatum(datum, weight);
+ totalSquaredWeight += weight * weight;
+ double weightedData = datum * weight;
+ totalWeightedData += weightedData;
+ totalWeightedSquaredData += weightedData * datum;
+ }
+
+ @Override
+ public synchronized void removeDatum(double datum, double weight) {
+ super.removeDatum(datum, weight);
+ totalSquaredWeight -= weight * weight;
+ if (totalSquaredWeight <= 0.0) {
+ totalSquaredWeight = 0.0;
+ }
+ double weightedData = datum * weight;
+ totalWeightedData -= weightedData;
+ if (totalWeightedData <= 0.0) {
+ totalWeightedData = 0.0;
+ }
+ totalWeightedSquaredData -= weightedData * datum;
+ if (totalWeightedSquaredData <= 0.0) {
+ totalWeightedSquaredData = 0.0;
+ }
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public synchronized void changeDatum(double delta, double weight) {
+ throw new UnsupportedOperationException();
+ }
+
+
+ @Override
+ public synchronized double getStandardDeviation() {
+ double totalWeight = getTotalWeight();
+ return Math.sqrt((totalWeightedSquaredData * totalWeight - totalWeightedData * totalWeightedData)
+ / (totalWeight * totalWeight - totalSquaredWeight));
+ }
+
+ @Override
+ public RunningAverageAndStdDev inverse() {
+ return new InvertedRunningAverageAndStdDev(this);
+ }
+
+ @Override
+ public synchronized String toString() {
+ return String.valueOf(String.valueOf(getAverage()) + ',' + getStandardDeviation());
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
new file mode 100644
index 0000000..d1e93ab
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/AbstractJDBCComponent.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.naming.Context;
+import javax.naming.InitialContext;
+import javax.naming.NamingException;
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A helper class with common elements for several JDBC-related components.
+ */
+public abstract class AbstractJDBCComponent {
+
+ private static final Logger log = LoggerFactory.getLogger(AbstractJDBCComponent.class);
+
+ private static final int DEFAULT_FETCH_SIZE = 1000; // A max, "big" number of rows to buffer at once
+ protected static final String DEFAULT_DATASOURCE_NAME = "jdbc/taste";
+
+ protected static void checkNotNullAndLog(String argName, Object value) {
+ Preconditions.checkArgument(value != null && !value.toString().isEmpty(),
+ argName + " is null or empty");
+ log.debug("{}: {}", argName, value);
+ }
+
+ protected static void checkNotNullAndLog(String argName, Object[] values) {
+ Preconditions.checkArgument(values != null && values.length != 0, argName + " is null or zero-length");
+ for (Object value : values) {
+ checkNotNullAndLog(argName, value);
+ }
+ }
+
+ /**
+ * <p>
+ * Looks up a {@link DataSource} by name from JNDI. "java:comp/env/" is prepended to the argument before
+ * looking up the name in JNDI.
+ * </p>
+ *
+ * @param dataSourceName
+ * JNDI name where a {@link DataSource} is bound (e.g. "jdbc/taste")
+ * @return {@link DataSource} under that JNDI name
+ * @throws TasteException
+ * if a JNDI error occurs
+ */
+ public static DataSource lookupDataSource(String dataSourceName) throws TasteException {
+ Context context = null;
+ try {
+ context = new InitialContext();
+ return (DataSource) context.lookup("java:comp/env/" + dataSourceName);
+ } catch (NamingException ne) {
+ throw new TasteException(ne);
+ } finally {
+ if (context != null) {
+ try {
+ context.close();
+ } catch (NamingException ne) {
+ log.warn("Error while closing Context; continuing...", ne);
+ }
+ }
+ }
+ }
+
+ protected int getFetchSize() {
+ return DEFAULT_FETCH_SIZE;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
new file mode 100644
index 0000000..3f024bc
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/EachRowIterator.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.sql.DataSource;
+import java.io.Closeable;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.common.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * Provides an {@link java.util.Iterator} over the result of an SQL query, as an iteration over the {@link ResultSet}.
 * While the same object will be returned from the iteration each time, it will be returned once for each row
 * of the result.
 */
final class EachRowIterator extends AbstractIterator<ResultSet> implements Closeable {

  private static final Logger log = LoggerFactory.getLogger(EachRowIterator.class);

  private final Connection connection;
  private final PreparedStatement statement;
  private final ResultSet resultSet;

  /**
   * Opens a connection, then prepares and executes the query in forward-only, read-only mode.
   * On failure, resources acquired so far are released (close() is assumed to tolerate
   * still-null fields) before the exception is rethrown.
   *
   * @throws SQLException if obtaining the connection, preparing or executing the query fails
   */
  EachRowIterator(DataSource dataSource, String sqlQuery) throws SQLException {
    try {
      connection = dataSource.getConnection();
      statement = connection.prepareStatement(sqlQuery, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
      statement.setFetchDirection(ResultSet.FETCH_FORWARD);
      //statement.setFetchSize(getFetchSize());
      log.debug("Executing SQL query: {}", sqlQuery);
      resultSet = statement.executeQuery();
    } catch (SQLException sqle) {
      close();
      throw sqle;
    }
  }

  // Advances the cursor one row; returns the shared ResultSet positioned on that row,
  // or signals end-of-data (and eagerly releases resources) when the cursor is exhausted.
  @Override
  protected ResultSet computeNext() {
    try {
      if (resultSet.next()) {
        return resultSet;
      } else {
        close();
        return null;
      }
    } catch (SQLException sqle) {
      close();
      throw new IllegalStateException(sqle);
    }
  }

  /**
   * Moves the cursor forward {@code n} rows.
   *
   * @throws SQLException declared, though relative-move failures are handled by fallback below
   */
  public void skip(int n) throws SQLException {
    try {
      resultSet.relative(n);
    } catch (SQLException sqle) {
      // Can't use relative on MySQL Connector/J; try advancing manually
      // NOTE(review): the original exception is silently dropped here; consider logging
      // sqle so the root cause survives if the manual advance also misbehaves.
      int i = 0;
      while (i < n && resultSet.next()) {
        i++;
      }
    }
  }

  // Releases the JDBC resources quietly and marks this iterator finished.
  @Override
  public void close() {
    IOUtils.quietClose(resultSet, statement, connection);
    endOfData();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
new file mode 100644
index 0000000..273ebd5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/jdbc/ResultSetIterator.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common.jdbc;
+
+import javax.sql.DataSource;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Iterator;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+
+public abstract class ResultSetIterator<T> extends ForwardingIterator<T> {
+
+ private final Iterator<T> delegate;
+ private final EachRowIterator rowDelegate;
+
+ protected ResultSetIterator(DataSource dataSource, String sqlQuery) throws SQLException {
+ this.rowDelegate = new EachRowIterator(dataSource, sqlQuery);
+ delegate = Iterators.transform(rowDelegate,
+ new Function<ResultSet, T>() {
+ @Override
+ public T apply(ResultSet from) {
+ try {
+ return parseElement(from);
+ } catch (SQLException sqle) {
+ throw new IllegalStateException(sqle);
+ }
+ }
+ });
+ }
+
+ @Override
+ protected Iterator<T> delegate() {
+ return delegate;
+ }
+
+ protected abstract T parseElement(ResultSet resultSet) throws SQLException;
+
+ public void skip(int n) {
+ if (n >= 1) {
+ try {
+ rowDelegate.skip(n);
+ } catch (SQLException sqle) {
+ throw new IllegalStateException(sqle);
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
new file mode 100644
index 0000000..f926f18
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AbstractDifferenceRecommenderEvaluator.java
@@ -0,0 +1,276 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract superclass of a couple of evaluator implementations, providing shared functionality.
+ */
+public abstract class AbstractDifferenceRecommenderEvaluator implements RecommenderEvaluator {
+
+  private static final Logger log = LoggerFactory.getLogger(AbstractDifferenceRecommenderEvaluator.class);
+
+  private final Random random;
+  // NaN disables capping, since all comparisons against NaN are false in capEstimatedPreference()
+  private float maxPreference;
+  private float minPreference;
+
+  protected AbstractDifferenceRecommenderEvaluator() {
+    random = RandomUtils.getRandom();
+    maxPreference = Float.NaN;
+    minPreference = Float.NaN;
+  }
+
+  @Override
+  public final float getMaxPreference() {
+    return maxPreference;
+  }
+
+  @Override
+  public final void setMaxPreference(float maxPreference) {
+    this.maxPreference = maxPreference;
+  }
+
+  @Override
+  public final float getMinPreference() {
+    return minPreference;
+  }
+
+  @Override
+  public final void setMinPreference(float minPreference) {
+    this.minPreference = minPreference;
+  }
+
+  /**
+   * Splits each sampled user's preferences into training and test sets, builds a
+   * {@link Recommender} from the training data, and scores its estimates against
+   * the held-out test preferences.
+   *
+   * @param trainingPercentage approximate fraction of each sampled user's prefs used for training
+   * @param evaluationPercentage approximate fraction of users included in the evaluation
+   * @return the score defined by the subclass via {@link #computeFinalEvaluation()}
+   */
+  @Override
+  public double evaluate(RecommenderBuilder recommenderBuilder,
+                         DataModelBuilder dataModelBuilder,
+                         DataModel dataModel,
+                         double trainingPercentage,
+                         double evaluationPercentage) throws TasteException {
+    Preconditions.checkNotNull(recommenderBuilder);
+    Preconditions.checkNotNull(dataModel);
+    Preconditions.checkArgument(trainingPercentage >= 0.0 && trainingPercentage <= 1.0,
+      "Invalid trainingPercentage: " + trainingPercentage + ". Must be: 0.0 <= trainingPercentage <= 1.0");
+    Preconditions.checkArgument(evaluationPercentage >= 0.0 && evaluationPercentage <= 1.0,
+      "Invalid evaluationPercentage: " + evaluationPercentage + ". Must be: 0.0 <= evaluationPercentage <= 1.0");
+
+    log.info("Beginning evaluation using {} of {}", trainingPercentage, dataModel);
+
+    int numUsers = dataModel.getNumUsers();
+    FastByIDMap<PreferenceArray> trainingPrefs = new FastByIDMap<>(
+        1 + (int) (evaluationPercentage * numUsers));
+    FastByIDMap<PreferenceArray> testPrefs = new FastByIDMap<>(
+        1 + (int) (evaluationPercentage * numUsers));
+
+    LongPrimitiveIterator it = dataModel.getUserIDs();
+    while (it.hasNext()) {
+      long userID = it.nextLong();
+      if (random.nextDouble() < evaluationPercentage) {
+        splitOneUsersPrefs(trainingPercentage, trainingPrefs, testPrefs, userID, dataModel);
+      }
+    }
+
+    DataModel trainingModel = dataModelBuilder == null ? new GenericDataModel(trainingPrefs)
+        : dataModelBuilder.buildDataModel(trainingPrefs);
+
+    Recommender recommender = recommenderBuilder.buildRecommender(trainingModel);
+
+    double result = getEvaluation(testPrefs, recommender);
+    log.info("Evaluation result: {}", result);
+    return result;
+  }
+
+  /** Randomly assigns each of one user's preferences to the training or the test set. */
+  private void splitOneUsersPrefs(double trainingPercentage,
+                                  FastByIDMap<PreferenceArray> trainingPrefs,
+                                  FastByIDMap<PreferenceArray> testPrefs,
+                                  long userID,
+                                  DataModel dataModel) throws TasteException {
+    List<Preference> oneUserTrainingPrefs = null;
+    List<Preference> oneUserTestPrefs = null;
+    PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+    int size = prefs.length();
+    for (int i = 0; i < size; i++) {
+      Preference newPref = new GenericPreference(userID, prefs.getItemID(i), prefs.getValue(i));
+      if (random.nextDouble() < trainingPercentage) {
+        if (oneUserTrainingPrefs == null) {
+          oneUserTrainingPrefs = new ArrayList<>(3);
+        }
+        oneUserTrainingPrefs.add(newPref);
+      } else {
+        if (oneUserTestPrefs == null) {
+          oneUserTestPrefs = new ArrayList<>(3);
+        }
+        oneUserTestPrefs.add(newPref);
+      }
+    }
+    if (oneUserTrainingPrefs != null) {
+      trainingPrefs.put(userID, new GenericUserPreferenceArray(oneUserTrainingPrefs));
+      // Test prefs are only kept for users who also have training prefs; otherwise the
+      // recommender would know nothing about the user and could not estimate anything
+      if (oneUserTestPrefs != null) {
+        testPrefs.put(userID, new GenericUserPreferenceArray(oneUserTestPrefs));
+      }
+    }
+  }
+
+  /** Clamps an estimate into [minPreference, maxPreference]; NaN bounds leave it unchanged. */
+  private float capEstimatedPreference(float estimate) {
+    if (estimate > maxPreference) {
+      return maxPreference;
+    }
+    if (estimate < minPreference) {
+      return minPreference;
+    }
+    return estimate;
+  }
+
+  private double getEvaluation(FastByIDMap<PreferenceArray> testPrefs, Recommender recommender)
+    throws TasteException {
+    reset();
+    Collection<Callable<Void>> estimateCallables = new ArrayList<>();
+    AtomicInteger noEstimateCounter = new AtomicInteger();
+    for (Map.Entry<Long,PreferenceArray> entry : testPrefs.entrySet()) {
+      estimateCallables.add(
+          new PreferenceEstimateCallable(recommender, entry.getKey(), entry.getValue(), noEstimateCounter));
+    }
+    log.info("Beginning evaluation of {} users", estimateCallables.size());
+    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+    execute(estimateCallables, noEstimateCounter, timing);
+    return computeFinalEvaluation();
+  }
+
+  /**
+   * Runs the given callables on a fixed-size pool (one thread per available processor),
+   * wrapping each with a {@link StatsCallable} that records timing. Rethrows the first
+   * task failure as a {@link TasteException}. The pool is always shut down, even when a
+   * task fails or the calling thread is interrupted.
+   */
+  protected static void execute(Collection<Callable<Void>> callables,
+                                AtomicInteger noEstimateCounter,
+                                RunningAverageAndStdDev timing) throws TasteException {
+
+    Collection<Callable<Void>> wrappedCallables = wrapWithStatsCallables(callables, noEstimateCounter, timing);
+    int numProcessors = Runtime.getRuntime().availableProcessors();
+    ExecutorService executor = Executors.newFixedThreadPool(numProcessors);
+    log.info("Starting timing of {} tasks in {} threads", wrappedCallables.size(), numProcessors);
+    try {
+      List<Future<Void>> futures = executor.invokeAll(wrappedCallables);
+      // Go look for exceptions here, really
+      for (Future<Void> future : futures) {
+        future.get();
+      }
+    } catch (InterruptedException ie) {
+      // Restore the interrupt flag so callers can still observe the interruption
+      Thread.currentThread().interrupt();
+      throw new TasteException(ie);
+    } catch (ExecutionException ee) {
+      throw new TasteException(ee.getCause());
+    } finally {
+      // Release the pool's threads even when invokeAll or a task failed above
+      executor.shutdown();
+      try {
+        executor.awaitTermination(10, TimeUnit.SECONDS);
+      } catch (InterruptedException e) {
+        Thread.currentThread().interrupt();
+      }
+    }
+  }
+
+  private static Collection<Callable<Void>> wrapWithStatsCallables(Iterable<Callable<Void>> callables,
+                                                                   AtomicInteger noEstimateCounter,
+                                                                   RunningAverageAndStdDev timing) {
+    Collection<Callable<Void>> wrapped = new ArrayList<>();
+    int count = 0;
+    for (Callable<Void> callable : callables) {
+      boolean logStats = count++ % 1000 == 0; // log every 1000 or so iterations
+      wrapped.add(new StatsCallable(callable, logStats, timing, noEstimateCounter));
+    }
+    return wrapped;
+  }
+
+  /** Clears any state accumulated by a previous evaluation run. */
+  protected abstract void reset();
+
+  /** Folds one (estimated, actual) preference pair into the evaluation statistic. */
+  protected abstract void processOneEstimate(float estimatedPreference, Preference realPref);
+
+  /** Returns the final score accumulated via {@link #processOneEstimate(float, Preference)}. */
+  protected abstract double computeFinalEvaluation();
+
+  /** Estimates preferences for one test user and feeds each result to the evaluator. */
+  public final class PreferenceEstimateCallable implements Callable<Void> {
+
+    private final Recommender recommender;
+    private final long testUserID;
+    private final PreferenceArray prefs;
+    private final AtomicInteger noEstimateCounter;
+
+    public PreferenceEstimateCallable(Recommender recommender,
+                                      long testUserID,
+                                      PreferenceArray prefs,
+                                      AtomicInteger noEstimateCounter) {
+      this.recommender = recommender;
+      this.testUserID = testUserID;
+      this.prefs = prefs;
+      this.noEstimateCounter = noEstimateCounter;
+    }
+
+    @Override
+    public Void call() throws TasteException {
+      for (Preference realPref : prefs) {
+        float estimatedPreference = Float.NaN;
+        try {
+          estimatedPreference = recommender.estimatePreference(testUserID, realPref.getItemID());
+        } catch (NoSuchUserException nsue) {
+          // It's possible that an item exists in the test data but not training data in which case
+          // NSEE will be thrown. Just ignore it and move on.
+          log.info("User exists in test data but not training data: {}", testUserID);
+        } catch (NoSuchItemException nsie) {
+          log.info("Item exists in test data but not training data: {}", realPref.getItemID());
+        }
+        if (Float.isNaN(estimatedPreference)) {
+          noEstimateCounter.incrementAndGet();
+        } else {
+          estimatedPreference = capEstimatedPreference(estimatedPreference);
+          processOneEstimate(estimatedPreference, realPref);
+        }
+      }
+      return null;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
new file mode 100644
index 0000000..4dad040
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/AverageAbsoluteDifferenceRecommenderEvaluator.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.Preference;
+
+/**
+ * <p>
+ * A {@link org.apache.mahout.cf.taste.eval.RecommenderEvaluator} which computes the average absolute
+ * difference between predicted and actual ratings for users.
+ * </p>
+ *
+ * <p>
+ * This metric is also known as "mean absolute error" (MAE).
+ * </p>
+ */
+public final class AverageAbsoluteDifferenceRecommenderEvaluator extends
+ AbstractDifferenceRecommenderEvaluator {
+
+ private RunningAverage average;
+
+ @Override
+ protected void reset() {
+ average = new FullRunningAverage();
+ }
+
+ @Override
+ protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+ average.addDatum(Math.abs(realPref.getValue() - estimatedPreference));
+ }
+
+ @Override
+ protected double computeFinalEvaluation() {
+ return average.getAverage();
+ }
+
+ @Override
+ public String toString() {
+ return "AverageAbsoluteDifferenceRecommenderEvaluator";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
new file mode 100644
index 0000000..0e121d1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRecommenderIRStatsEvaluator.java
@@ -0,0 +1,237 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.List;
+import java.util.Random;
+
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * For each user, this implementation determines the top {@code n} preferences, then evaluates the IR
+ * statistics based on a {@link DataModel} that does not have these values. This number {@code n} is the
+ * "at" value, as in "precision at 5". For example, this would mean precision evaluated by removing the top 5
+ * preferences for a user and then finding the percentage of those 5 items included in the top 5
+ * recommendations for that user.
+ * </p>
+ */
+public final class GenericRecommenderIRStatsEvaluator implements RecommenderIRStatsEvaluator {
+
+  private static final Logger log = LoggerFactory.getLogger(GenericRecommenderIRStatsEvaluator.class);
+
+  // Natural log of 2, cached to convert Math.log results to base 2 in log2()
+  private static final double LOG2 = Math.log(2.0);
+
+  /**
+   * Pass as "relevanceThreshold" argument to
+   * {@link #evaluate(RecommenderBuilder, DataModelBuilder, DataModel, IDRescorer, int, double, double)} to
+   * have it attempt to compute a reasonable threshold. Note that this will impact performance.
+   */
+  public static final double CHOOSE_THRESHOLD = Double.NaN;
+
+  private final Random random;
+  // Strategy that picks each user's "relevant" items and assembles the training data
+  private final RelevantItemsDataSplitter dataSplitter;
+
+  /** Uses a {@link GenericRelevantItemsDataSplitter} to choose relevant items. */
+  public GenericRecommenderIRStatsEvaluator() {
+    this(new GenericRelevantItemsDataSplitter());
+  }
+
+  /**
+   * @param dataSplitter strategy that picks relevant items and builds training data; must not be null
+   */
+  public GenericRecommenderIRStatsEvaluator(RelevantItemsDataSplitter dataSplitter) {
+    Preconditions.checkNotNull(dataSplitter);
+    random = RandomUtils.getRandom();
+    this.dataSplitter = dataSplitter;
+  }
+
+  /**
+   * For each sampled user: withholds that user's most-relevant items, builds a recommender on
+   * the remaining data, then measures precision, recall, fall-out, nDCG and reach of the
+   * top-{@code at} recommendations against the withheld items.
+   */
+  @Override
+  public IRStatistics evaluate(RecommenderBuilder recommenderBuilder,
+                               DataModelBuilder dataModelBuilder,
+                               DataModel dataModel,
+                               IDRescorer rescorer,
+                               int at,
+                               double relevanceThreshold,
+                               double evaluationPercentage) throws TasteException {
+
+    Preconditions.checkArgument(recommenderBuilder != null, "recommenderBuilder is null");
+    Preconditions.checkArgument(dataModel != null, "dataModel is null");
+    Preconditions.checkArgument(at >= 1, "at must be at least 1");
+    Preconditions.checkArgument(evaluationPercentage > 0.0 && evaluationPercentage <= 1.0,
+      "Invalid evaluationPercentage: " + evaluationPercentage + ". Must be: 0.0 < evaluationPercentage <= 1.0");
+
+    int numItems = dataModel.getNumItems();
+    RunningAverage precision = new FullRunningAverage();
+    RunningAverage recall = new FullRunningAverage();
+    RunningAverage fallOut = new FullRunningAverage();
+    RunningAverage nDCG = new FullRunningAverage();
+    int numUsersRecommendedFor = 0;
+    int numUsersWithRecommendations = 0;
+
+    LongPrimitiveIterator it = dataModel.getUserIDs();
+    while (it.hasNext()) {
+
+      long userID = it.nextLong();
+
+      // Sample users: each user is evaluated with probability evaluationPercentage
+      if (random.nextDouble() >= evaluationPercentage) {
+        // Skipped
+        continue;
+      }
+
+      long start = System.currentTimeMillis();
+
+      PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+
+      // List some most-preferred items that would count as (most) "relevant" results
+      double theRelevanceThreshold = Double.isNaN(relevanceThreshold) ? computeThreshold(prefs) : relevanceThreshold;
+      FastIDSet relevantItemIDs = dataSplitter.getRelevantItemsIDs(userID, at, theRelevanceThreshold, dataModel);
+
+      int numRelevantItems = relevantItemIDs.size();
+      if (numRelevantItems <= 0) {
+        continue;
+      }
+
+      // Build training data that excludes this user's relevant items
+      FastByIDMap<PreferenceArray> trainingUsers = new FastByIDMap<>(dataModel.getNumUsers());
+      LongPrimitiveIterator it2 = dataModel.getUserIDs();
+      while (it2.hasNext()) {
+        dataSplitter.processOtherUser(userID, relevantItemIDs, trainingUsers, it2.nextLong(), dataModel);
+      }
+
+      DataModel trainingModel = dataModelBuilder == null ? new GenericDataModel(trainingUsers)
+          : dataModelBuilder.buildDataModel(trainingUsers);
+      try {
+        trainingModel.getPreferencesFromUser(userID);
+      } catch (NoSuchUserException nsee) {
+        continue; // Oops we excluded all prefs for the user -- just move on
+      }
+
+      int size = numRelevantItems + trainingModel.getItemIDsFromUser(userID).size();
+      if (size < 2 * at) {
+        // Really not enough prefs to meaningfully evaluate this user
+        continue;
+      }
+
+      Recommender recommender = recommenderBuilder.buildRecommender(trainingModel);
+
+      // Count how many of the top-"at" recommendations are among the relevant items
+      int intersectionSize = 0;
+      List<RecommendedItem> recommendedItems = recommender.recommend(userID, at, rescorer);
+      for (RecommendedItem recommendedItem : recommendedItems) {
+        if (relevantItemIDs.contains(recommendedItem.getItemID())) {
+          intersectionSize++;
+        }
+      }
+
+      int numRecommendedItems = recommendedItems.size();
+
+      // Precision
+      if (numRecommendedItems > 0) {
+        precision.addDatum((double) intersectionSize / (double) numRecommendedItems);
+      }
+
+      // Recall
+      recall.addDatum((double) intersectionSize / (double) numRelevantItems);
+
+      // Fall-out
+      if (numRelevantItems < size) {
+        fallOut.addDatum((double) (numRecommendedItems - intersectionSize)
+            / (double) (numItems - numRelevantItems));
+      }
+
+      // nDCG
+      // In computing, assume relevant IDs have relevance 1 and others 0
+      double cumulativeGain = 0.0;
+      double idealizedGain = 0.0;
+      for (int i = 0; i < numRecommendedItems; i++) {
+        RecommendedItem item = recommendedItems.get(i);
+        double discount = 1.0 / log2(i + 2.0); // Classical formulation says log(i+1), but i is 0-based here
+        if (relevantItemIDs.contains(item.getItemID())) {
+          cumulativeGain += discount;
+        }
+        // otherwise we're multiplying discount by relevance 0 so it doesn't do anything
+
+        // Ideally results would be ordered with all relevant ones first, so this theoretical
+        // ideal list starts with number of relevant items equal to the total number of relevant items
+        if (i < numRelevantItems) {
+          idealizedGain += discount;
+        }
+      }
+      if (idealizedGain > 0.0) {
+        nDCG.addDatum(cumulativeGain / idealizedGain);
+      }
+
+      // Reach
+      numUsersRecommendedFor++;
+      if (numRecommendedItems > 0) {
+        numUsersWithRecommendations++;
+      }
+
+      long end = System.currentTimeMillis();
+
+      log.info("Evaluated with user {} in {}ms", userID, end - start);
+      log.info("Precision/recall/fall-out/nDCG/reach: {} / {} / {} / {} / {}",
+          precision.getAverage(), recall.getAverage(), fallOut.getAverage(), nDCG.getAverage(),
+          (double) numUsersWithRecommendations / (double) numUsersRecommendedFor);
+    }
+
+    // NOTE(review): if no user qualified, numUsersRecommendedFor is 0 and reach is NaN here;
+    // presumably IRStatisticsImpl tolerates NaN -- confirm against its precondition checks
+    return new IRStatisticsImpl(
+        precision.getAverage(),
+        recall.getAverage(),
+        fallOut.getAverage(),
+        nDCG.getAverage(),
+        (double) numUsersWithRecommendations / (double) numUsersRecommendedFor);
+  }
+
+  /**
+   * Returns a relevance threshold of mean plus one standard deviation of the user's
+   * preference values, or negative infinity (admit everything) with fewer than 2 prefs.
+   */
+  private static double computeThreshold(PreferenceArray prefs) {
+    if (prefs.length() < 2) {
+      // Not enough data points -- return a threshold that allows everything
+      return Double.NEGATIVE_INFINITY;
+    }
+    RunningAverageAndStdDev stdDev = new FullRunningAverageAndStdDev();
+    int size = prefs.length();
+    for (int i = 0; i < size; i++) {
+      stdDev.addDatum(prefs.getValue(i));
+    }
+    return stdDev.getAverage() + stdDev.getStandardDeviation();
+  }
+
+  /** Returns log base 2 of {@code value}. */
+  private static double log2(double value) {
+    return Math.log(value) / LOG2;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
new file mode 100644
index 0000000..f4e4522
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/GenericRelevantItemsDataSplitter.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RelevantItemsDataSplitter;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Picks relevant items to be those with the strongest preference, and
+ * includes the other users' preferences in full.
+ */
+public final class GenericRelevantItemsDataSplitter implements RelevantItemsDataSplitter {
+
+ @Override
+ public FastIDSet getRelevantItemsIDs(long userID,
+ int at,
+ double relevanceThreshold,
+ DataModel dataModel) throws TasteException {
+ PreferenceArray prefs = dataModel.getPreferencesFromUser(userID);
+ FastIDSet relevantItemIDs = new FastIDSet(at);
+ prefs.sortByValueReversed();
+ for (int i = 0; i < prefs.length() && relevantItemIDs.size() < at; i++) {
+ if (prefs.getValue(i) >= relevanceThreshold) {
+ relevantItemIDs.add(prefs.getItemID(i));
+ }
+ }
+ return relevantItemIDs;
+ }
+
+ @Override
+ public void processOtherUser(long userID,
+ FastIDSet relevantItemIDs,
+ FastByIDMap<PreferenceArray> trainingUsers,
+ long otherUserID,
+ DataModel dataModel) throws TasteException {
+ PreferenceArray prefs2Array = dataModel.getPreferencesFromUser(otherUserID);
+ // If we're dealing with the very user that we're evaluating for precision/recall,
+ if (userID == otherUserID) {
+ // then must remove all the test IDs, the "relevant" item IDs
+ List<Preference> prefs2 = new ArrayList<>(prefs2Array.length());
+ for (Preference pref : prefs2Array) {
+ prefs2.add(pref);
+ }
+ for (Iterator<Preference> iterator = prefs2.iterator(); iterator.hasNext();) {
+ Preference pref = iterator.next();
+ if (relevantItemIDs.contains(pref.getItemID())) {
+ iterator.remove();
+ }
+ }
+ if (!prefs2.isEmpty()) {
+ trainingUsers.put(otherUserID, new GenericUserPreferenceArray(prefs2));
+ }
+ } else {
+ // otherwise just add all those other user's prefs
+ trainingUsers.put(otherUserID, prefs2Array);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
new file mode 100644
index 0000000..2838b08
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/IRStatisticsImpl.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+
+import com.google.common.base.Preconditions;
+
+public final class IRStatisticsImpl implements IRStatistics, Serializable {
+
+ private final double precision;
+ private final double recall;
+ private final double fallOut;
+ private final double ndcg;
+ private final double reach;
+
+ IRStatisticsImpl(double precision, double recall, double fallOut, double ndcg, double reach) {
+ Preconditions.checkArgument(Double.isNaN(precision) || (precision >= 0.0 && precision <= 1.0),
+ "Illegal precision: " + precision + ". Must be: 0.0 <= precision <= 1.0 or NaN");
+ Preconditions.checkArgument(Double.isNaN(recall) || (recall >= 0.0 && recall <= 1.0),
+ "Illegal recall: " + recall + ". Must be: 0.0 <= recall <= 1.0 or NaN");
+ Preconditions.checkArgument(Double.isNaN(fallOut) || (fallOut >= 0.0 && fallOut <= 1.0),
+ "Illegal fallOut: " + fallOut + ". Must be: 0.0 <= fallOut <= 1.0 or NaN");
+ Preconditions.checkArgument(Double.isNaN(ndcg) || (ndcg >= 0.0 && ndcg <= 1.0),
+ "Illegal nDCG: " + ndcg + ". Must be: 0.0 <= nDCG <= 1.0 or NaN");
+ Preconditions.checkArgument(Double.isNaN(reach) || (reach >= 0.0 && reach <= 1.0),
+ "Illegal reach: " + reach + ". Must be: 0.0 <= reach <= 1.0 or NaN");
+ this.precision = precision;
+ this.recall = recall;
+ this.fallOut = fallOut;
+ this.ndcg = ndcg;
+ this.reach = reach;
+ }
+
+ @Override
+ public double getPrecision() {
+ return precision;
+ }
+
+ @Override
+ public double getRecall() {
+ return recall;
+ }
+
+ @Override
+ public double getFallOut() {
+ return fallOut;
+ }
+
+ @Override
+ public double getF1Measure() {
+ return getFNMeasure(1.0);
+ }
+
+ @Override
+ public double getFNMeasure(double b) {
+ double b2 = b * b;
+ double sum = b2 * precision + recall;
+ return sum == 0.0 ? Double.NaN : (1.0 + b2) * precision * recall / sum;
+ }
+
+ @Override
+ public double getNormalizedDiscountedCumulativeGain() {
+ return ndcg;
+ }
+
+ @Override
+ public double getReach() {
+ return reach;
+ }
+
+ @Override
+ public String toString() {
+ return "IRStatisticsImpl[precision:" + precision + ",recall:" + recall + ",fallOut:"
+ + fallOut + ",nDCG:" + ndcg + ",reach:" + reach + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
new file mode 100644
index 0000000..213f7f9
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadCallable.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+import java.util.concurrent.Callable;
+
+/**
+ * A {@link Callable} that requests ten recommendations for a single user from the
+ * given {@link Recommender}, discarding the result.
+ */
+final class LoadCallable implements Callable<Void> {
+
+  private final Recommender target;
+  private final long targetUserID;
+
+  LoadCallable(Recommender recommender, long userID) {
+    this.target = recommender;
+    this.targetUserID = userID;
+  }
+
+  @Override
+  public Void call() throws Exception {
+    // Only the cost of computing the recommendations matters for load testing
+    target.recommend(targetUserID, 10);
+    return null;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
new file mode 100644
index 0000000..2d27a37
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadEvaluator.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+/**
+ * Simple helper class for running load on a {@link Recommender}: samples a
+ * set of users from the underlying {@link DataModel} and times concurrent
+ * recommendation requests for them.
+ */
+public final class LoadEvaluator {
+
+  private LoadEvaluator() { }
+
+  /** Runs the load test requesting 10 recommendations per sampled user. */
+  public static LoadStatistics runLoad(Recommender recommender) throws TasteException {
+    return runLoad(recommender, 10);
+  }
+
+  /**
+   * Samples roughly 1000 users from the recommender's data model and times
+   * concurrent recommendation calls for them.
+   *
+   * <p>NOTE(review): {@code howMany} only affects the single warm-up call here;
+   * the timed calls go through {@code LoadCallable}, which requests a fixed 10
+   * recommendations — possibly an oversight to confirm upstream.</p>
+   *
+   * @param recommender recommender under test
+   * @param howMany number of recommendations requested in the warm-up call
+   * @return timing statistics gathered over the sampled calls
+   * @throws TasteException if the underlying data access fails
+   */
+  public static LoadStatistics runLoad(Recommender recommender, int howMany) throws TasteException {
+    DataModel model = recommender.getDataModel();
+    // Aim for a sample of about 1000 users regardless of model size
+    double rate = 1000.0 / model.getNumUsers();
+    LongPrimitiveIterator sampler =
+        SamplingLongPrimitiveIterator.maybeWrapIterator(model.getUserIDs(), rate);
+    // One untimed request up front to warm caches before measuring
+    recommender.recommend(sampler.next(), howMany);
+    Collection<Callable<Void>> tasks = new ArrayList<>();
+    while (sampler.hasNext()) {
+      tasks.add(new LoadCallable(recommender, sampler.next()));
+    }
+    RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+    // The no-estimate counter is required by execute() but unused for load runs
+    AbstractDifferenceRecommenderEvaluator.execute(tasks, new AtomicInteger(), timing);
+    return new LoadStatistics(timing);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
new file mode 100644
index 0000000..f89160c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/eval/LoadStatistics.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.eval;
+
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+
+/**
+ * Immutable holder for the timing statistics produced by a
+ * {@code LoadEvaluator} run.
+ */
+public final class LoadStatistics {
+
+  private final RunningAverage callTiming;
+
+  LoadStatistics(RunningAverage timing) {
+    callTiming = timing;
+  }
+
+  /** @return running average of per-recommendation call timings */
+  public RunningAverage getTiming() {
+    return callTiming;
+  }
+
+}
r***@apache.org
2018-06-28 14:54:57 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data b/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
new file mode 100644
index 0000000..8885375
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/test/resources/wdbc/wdbc.data
@@ -0,0 +1,569 @@
+842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
+842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902
+84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
+84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
+84358402,M,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575,0.1374,0.205,0.4,0.1625,0.2364,0.07678
+843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
+844359,M,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
+84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
+844981,M,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
+84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
+845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,0.3795,1.187,2.466,40.51,0.004029,0.009269,0.01101,0.007591,0.0146,0.003042,19.19,33.88,123.8,1150,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452
+84610002,M,15.78,17.89,103.6,781,0.0971,0.1292,0.09954,0.06606,0.1842,0.06082,0.5058,0.9849,3.564,54.16,0.005771,0.04061,0.02791,0.01282,0.02008,0.004144,20.42,27.28,136.5,1299,0.1396,0.5609,0.3965,0.181,0.3792,0.1048
+846226,M,19.17,24.8,132.4,1123,0.0974,0.2458,0.2065,0.1118,0.2397,0.078,0.9555,3.568,11.07,116.2,0.003139,0.08297,0.0889,0.0409,0.04484,0.01284,20.96,29.94,151.7,1332,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
+846381,M,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,0.4033,1.078,2.903,36.58,0.009769,0.03126,0.05051,0.01992,0.02981,0.003002,16.84,27.66,112,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287
+84667401,M,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,0.07682,0.2121,1.169,2.061,19.21,0.006429,0.05936,0.05501,0.01628,0.01961,0.008093,15.03,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431
+84799002,M,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,0.2303,0.07077,0.37,1.033,2.879,32.55,0.005607,0.0424,0.04741,0.0109,0.01857,0.005466,17.46,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341
+848406,M,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,0.05922,0.4727,1.24,3.195,45.4,0.005718,0.01162,0.01998,0.01109,0.0141,0.002085,19.07,30.88,123.4,1138,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216
+84862001,M,16.13,20.68,108.1,798.8,0.117,0.2022,0.1722,0.1028,0.2164,0.07356,0.5692,1.073,3.854,54.18,0.007026,0.02501,0.03188,0.01297,0.01689,0.004142,20.96,31.48,136.8,1315,0.1789,0.4233,0.4784,0.2073,0.3706,0.1142
+849014,M,19.81,22.15,130,1260,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,0.7582,1.017,5.865,112.4,0.006494,0.01893,0.03391,0.01521,0.01356,0.001997,27.32,30.88,186.8,2398,0.1512,0.315,0.5372,0.2388,0.2768,0.07615
+8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
+8510653,B,13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
+8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
+8511133,M,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,0.07032,0.4388,0.7096,3.384,44.91,0.006789,0.05328,0.06446,0.02252,0.03672,0.004394,18.07,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946
+851509,M,21.16,23.04,137.2,1404,0.09428,0.1022,0.1097,0.08632,0.1769,0.05278,0.6917,1.127,4.303,93.99,0.004728,0.01259,0.01715,0.01038,0.01083,0.001987,29.17,35.59,188,2615,0.1401,0.26,0.3155,0.2009,0.2822,0.07526
+852552,M,16.65,21.38,110,904.6,0.1121,0.1457,0.1525,0.0917,0.1995,0.0633,0.8068,0.9017,5.455,102.6,0.006048,0.01882,0.02741,0.0113,0.01468,0.002801,26.46,31.56,177,2215,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564
+852631,M,17.14,16.4,116,912.7,0.1186,0.2276,0.2229,0.1401,0.304,0.07413,1.046,0.976,7.276,111.4,0.008029,0.03799,0.03732,0.02397,0.02308,0.007444,22.25,21.4,152.4,1461,0.1545,0.3949,0.3853,0.255,0.4066,0.1059
+852763,M,14.58,21.53,97.41,644.8,0.1054,0.1868,0.1425,0.08783,0.2252,0.06924,0.2545,0.9832,2.11,21.05,0.004452,0.03055,0.02681,0.01352,0.01454,0.003711,17.62,33.21,122.4,896.9,0.1525,0.6643,0.5539,0.2701,0.4264,0.1275
+852781,M,18.61,20.25,122.1,1094,0.0944,0.1066,0.149,0.07731,0.1697,0.05699,0.8529,1.849,5.632,93.54,0.01075,0.02722,0.05081,0.01911,0.02293,0.004217,21.31,27.26,139.9,1403,0.1338,0.2117,0.3446,0.149,0.2341,0.07421
+852973,M,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,0.439,1.012,3.498,43.5,0.005233,0.03057,0.03576,0.01083,0.01768,0.002967,20.27,36.71,149.3,1269,0.1641,0.611,0.6335,0.2024,0.4027,0.09876
+853201,M,17.57,15.05,115,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,0.6003,0.8225,4.655,61.1,0.005627,0.03033,0.03407,0.01354,0.01925,0.003742,20.01,19.52,134.9,1227,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919
+853401,M,18.63,25.11,124.8,1088,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,0.8307,1.466,5.574,105,0.006248,0.03374,0.05196,0.01158,0.02007,0.00456,23.15,34.01,160.5,1670,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
+853612,M,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,0.4825,1.03,3.475,41,0.005551,0.03414,0.04205,0.01044,0.02273,0.005667,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402
+85382601,M,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,0.6009,1.398,3.999,67.78,0.008268,0.03082,0.05042,0.01112,0.02102,0.003854,20.88,32.09,136.1,1344,0.1634,0.3559,0.5588,0.1847,0.353,0.08482
+854002,M,19.27,26.47,127.9,1162,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,0.5558,0.6062,3.528,68.17,0.005015,0.03318,0.03497,0.009643,0.01543,0.003896,24.15,30.9,161.4,1813,0.1509,0.659,0.6091,0.1785,0.3672,0.1123
+854039,M,16.13,17.88,107,807.2,0.104,0.1559,0.1354,0.07752,0.1998,0.06515,0.334,0.6857,2.183,35.03,0.004185,0.02868,0.02664,0.009067,0.01703,0.003817,20.21,27.26,132.7,1261,0.1446,0.5804,0.5274,0.1864,0.427,0.1233
+854253,M,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,0.05656,0.4615,0.9197,3.008,45.19,0.005776,0.02499,0.03695,0.01195,0.02789,0.002665,20.01,29.02,133.5,1229,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633
+854268,M,14.25,21.72,93.63,633,0.09823,0.1098,0.1319,0.05598,0.1885,0.06125,0.286,1.019,2.657,24.91,0.005878,0.02995,0.04815,0.01161,0.02028,0.004022,15.89,30.36,116.2,799.6,0.1446,0.4238,0.5186,0.1447,0.3591,0.1014
+854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
+855133,M,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,1.214,2.188,8.077,106,0.006883,0.01094,0.01818,0.01917,0.007882,0.001754,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504
+855138,M,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,0.172,0.06419,0.213,0.5914,1.545,18.52,0.005367,0.02239,0.03049,0.01262,0.01377,0.003187,15.53,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071
+855167,M,13.44,21.58,86.18,563,0.08162,0.06031,0.0311,0.02031,0.1784,0.05587,0.2385,0.8265,1.572,20.53,0.00328,0.01102,0.0139,0.006881,0.0138,0.001286,15.93,30.25,102.5,787.9,0.1094,0.2043,0.2085,0.1112,0.2994,0.07146
+855563,M,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,0.2366,1.428,1.822,16.97,0.008064,0.01764,0.02595,0.01037,0.01357,0.00304,12.84,35.34,87.22,514,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606
+855625,M,19.07,24.81,128.3,1104,0.09081,0.219,0.2107,0.09961,0.231,0.06343,0.9811,1.666,8.83,104.9,0.006548,0.1006,0.09723,0.02638,0.05333,0.007646,24.09,33.17,177.4,1651,0.1247,0.7444,0.7242,0.2493,0.467,0.1038
+856106,M,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,0.06782,0.3704,0.8249,2.427,31.33,0.005072,0.02147,0.02185,0.00956,0.01719,0.003317,17.38,28,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027
+85638502,M,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,0.1938,0.6123,1.334,14.49,0.00335,0.01384,0.01452,0.006853,0.01113,0.00172,16.23,29.89,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618
+857010,M,18.65,17.6,123.7,1076,0.1099,0.1686,0.1974,0.1009,0.1907,0.06049,0.6289,0.6633,4.293,71.56,0.006294,0.03994,0.05554,0.01695,0.02428,0.003535,22.82,21.32,150.6,1567,0.1679,0.509,0.7345,0.2378,0.3799,0.09185
+85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,0.1563,0.9567,1.094,8.205,0.008968,0.01646,0.01588,0.005917,0.02574,0.002582,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409
+85715,M,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,0.8937,1.897,24.25,0.006532,0.02336,0.02905,0.01215,0.01743,0.003643,15.67,27.95,102.8,759.4,0.1786,0.4166,0.5006,0.2088,0.39,0.1179
+857155,B,12.05,14.63,78.04,449.3,0.1031,0.09092,0.06592,0.02749,0.1675,0.06043,0.2636,0.7294,1.848,19.87,0.005488,0.01427,0.02322,0.00566,0.01428,0.002422,13.76,20.7,89.88,582.6,0.1494,0.2156,0.305,0.06548,0.2747,0.08301
+857156,B,13.49,22.3,86.91,561,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,0.2338,1.353,1.735,20.2,0.004455,0.01382,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917
+857343,B,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,0.05888,0.4062,1.21,2.635,28.47,0.005857,0.009758,0.01168,0.007445,0.02406,0.001769,12.98,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563
+857373,B,13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.67,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025
+857374,B,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,0.2273,0.6329,1.52,17.47,0.00721,0.00838,0.01311,0.008,0.01996,0.002635,13.1,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408
+857392,M,18.22,18.7,120.3,1033,0.1148,0.1485,0.1772,0.106,0.2092,0.0631,0.8337,1.593,4.877,98.81,0.003899,0.02961,0.02817,0.009222,0.02674,0.005126,20.6,24.13,135.1,1321,0.128,0.2297,0.2623,0.1325,0.3021,0.07987
+857438,M,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,0.3105,0.8339,2.097,29.91,0.004675,0.0103,0.01603,0.009222,0.01095,0.001629,18.1,31.69,117.7,1030,0.1389,0.2057,0.2712,0.153,0.2675,0.07873
+85759902,B,11.52,18.75,73.34,409,0.09524,0.05473,0.03036,0.02278,0.192,0.05907,0.3249,0.9591,2.183,23.47,0.008328,0.008722,0.01349,0.00867,0.03218,0.002386,12.84,22.47,81.81,506.2,0.1249,0.0872,0.09076,0.06316,0.3306,0.07036
+857637,M,19.21,18.57,125.5,1152,0.1053,0.1267,0.1323,0.08994,0.1917,0.05961,0.7275,1.193,4.837,102.5,0.006458,0.02306,0.02945,0.01538,0.01852,0.002608,26.14,28.14,170.1,2145,0.1624,0.3511,0.3879,0.2091,0.3537,0.08294
+857793,M,14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,0.4226,1.15,2.735,40.09,0.003659,0.02855,0.02572,0.01272,0.01817,0.004108,17.87,30.7,115.7,985.5,0.1368,0.429,0.3587,0.1834,0.3698,0.1094
+857810,B,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,0.1819,0.05501,0.404,1.214,2.595,32.96,0.007491,0.008593,0.000692,0.004167,0.0219,0.00299,14.23,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
+858477,B,8.618,11.79,54.34,224.5,0.09752,0.05272,0.02061,0.007799,0.1683,0.07187,0.1559,0.5796,1.046,8.322,0.01011,0.01055,0.01981,0.005742,0.0209,0.002788,9.507,15.4,59.9,274.9,0.1733,0.1239,0.1168,0.04419,0.322,0.09026
+858970,B,10.17,14.88,64.55,311.9,0.1134,0.08061,0.01084,0.0129,0.2743,0.0696,0.5158,1.441,3.312,34.62,0.007514,0.01099,0.007665,0.008193,0.04183,0.005953,11.02,17.45,69.86,368.6,0.1275,0.09866,0.02168,0.02579,0.3557,0.0802
+858981,B,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,0.1828,0.06757,0.3582,2.067,2.493,18.39,0.01193,0.03162,0.03,0.009259,0.03357,0.003048,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712
+858986,M,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,0.7036,1.268,5.373,60.78,0.009407,0.07056,0.06899,0.01848,0.017,0.006113,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132
+859196,B,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,0.2341,0.06963,0.4098,2.265,2.608,23.52,0.008738,0.03938,0.04312,0.0156,0.04192,0.005822,10.01,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849
+85922302,M,12.68,23.84,82.69,499,0.1122,0.1262,0.1128,0.06873,0.1905,0.0659,0.4255,1.178,2.927,36.46,0.007781,0.02648,0.02973,0.0129,0.01635,0.003601,17.09,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031
+859283,M,14.78,23.94,97.4,668.3,0.1172,0.1479,0.1267,0.09029,0.1953,0.06654,0.3577,1.281,2.45,35.24,0.006703,0.0231,0.02315,0.01184,0.019,0.003224,17.31,33.39,114.6,925.1,0.1648,0.3416,0.3024,0.1614,0.3321,0.08911
+859464,B,9.465,21.01,60.11,269.4,0.1044,0.07773,0.02172,0.01504,0.1717,0.06899,0.2351,2.011,1.66,14.2,0.01052,0.01755,0.01714,0.009333,0.02279,0.004237,10.41,31.56,67.03,330.7,0.1548,0.1664,0.09412,0.06517,0.2878,0.09211
+859465,B,11.31,19.04,71.8,394.1,0.08139,0.04701,0.03709,0.0223,0.1516,0.05667,0.2727,0.9429,1.831,18.15,0.009282,0.009216,0.02063,0.008965,0.02183,0.002146,12.33,23.84,78,466.7,0.129,0.09148,0.1444,0.06961,0.24,0.06641
+859471,B,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,0.3274,1.194,1.885,17.67,0.009549,0.08606,0.3038,0.03322,0.04197,0.009559,10.31,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175
+859487,B,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641
+859575,M,18.94,21.31,123.6,1130,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,0.7888,0.7975,5.486,96.05,0.004444,0.01652,0.02269,0.0137,0.01386,0.001698,24.86,26.58,165.9,1866,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
+859711,B,8.888,14.64,58.79,244,0.09783,0.1531,0.08606,0.02872,0.1902,0.0898,0.5262,0.8522,3.168,25.44,0.01721,0.09368,0.05671,0.01766,0.02541,0.02193,9.733,15.67,62.56,284.4,0.1207,0.2436,0.1434,0.04786,0.2254,0.1084
+859717,M,17.2,24.52,114.2,929.4,0.1071,0.183,0.1692,0.07944,0.1927,0.06487,0.5907,1.041,3.705,69.47,0.00582,0.05616,0.04252,0.01127,0.01527,0.006299,23.32,33.82,151.6,1681,0.1585,0.7394,0.6566,0.1899,0.3313,0.1339
+859983,M,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,0.1662,0.06566,0.2787,0.6205,1.957,23.35,0.004717,0.02065,0.01759,0.009206,0.0122,0.00313,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
+8610175,B,12.31,16.52,79.19,470.9,0.09172,0.06829,0.03372,0.02272,0.172,0.05914,0.2505,1.025,1.74,19.68,0.004854,0.01819,0.01826,0.007965,0.01386,0.002304,14.11,23.21,89.71,611.1,0.1176,0.1843,0.1703,0.0866,0.2618,0.07609
+8610404,M,16.07,19.65,104.1,817.7,0.09168,0.08424,0.09769,0.06638,0.1798,0.05391,0.7474,1.016,5.029,79.25,0.01082,0.02203,0.035,0.01809,0.0155,0.001948,19.77,24.56,128.8,1223,0.15,0.2045,0.2829,0.152,0.265,0.06387
+8610629,B,13.53,10.94,87.91,559.2,0.1291,0.1047,0.06877,0.06556,0.2403,0.06641,0.4101,1.014,2.652,32.65,0.0134,0.02839,0.01162,0.008239,0.02572,0.006164,14.08,12.49,91.36,605.5,0.1451,0.1379,0.08539,0.07407,0.271,0.07191
+8610637,M,18.05,16.15,120.2,1006,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.8,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.1,1610,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108
+8610862,M,20.18,23.97,143.7,1245,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,0.9317,1.885,8.649,116.4,0.01038,0.06835,0.1091,0.02593,0.07895,0.005987,23.37,31.72,170.3,1623,0.1639,0.6164,0.7681,0.2508,0.544,0.09964
+8610908,B,12.86,18,83.19,506.3,0.09934,0.09546,0.03889,0.02315,0.1718,0.05997,0.2655,1.095,1.778,20.35,0.005293,0.01661,0.02071,0.008179,0.01748,0.002848,14.24,24.82,91.88,622.1,0.1289,0.2141,0.1731,0.07926,0.2779,0.07918
+861103,B,11.45,20.97,73.81,401.5,0.1102,0.09362,0.04591,0.02233,0.1842,0.07005,0.3251,2.174,2.077,24.62,0.01037,0.01706,0.02586,0.007506,0.01816,0.003976,13.11,32.16,84.53,525.1,0.1557,0.1676,0.1755,0.06127,0.2762,0.08851
+8611161,B,13.34,15.86,86.49,520,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,0.286,1.016,1.535,12.96,0.006794,0.03575,0.0398,0.01383,0.02134,0.004603,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016
+8611555,M,25.22,24.91,171.5,1878,0.1063,0.2665,0.3339,0.1845,0.1829,0.06782,0.8973,1.474,7.382,120,0.008166,0.05693,0.0573,0.0203,0.01065,0.005893,30,33.62,211.7,2562,0.1573,0.6076,0.6476,0.2867,0.2355,0.1051
+8611792,M,19.1,26.29,129.1,1132,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,0.519,2.91,5.801,67.1,0.007545,0.0605,0.02134,0.01843,0.03056,0.01039,20.33,32.72,141.3,1298,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203
+8612080,B,12,15.65,76.95,443.3,0.09723,0.07165,0.04151,0.01863,0.2079,0.05968,0.2271,1.255,1.441,16.16,0.005969,0.01812,0.02007,0.007027,0.01972,0.002607,13.67,24.9,87.78,567.9,0.1377,0.2003,0.2267,0.07632,0.3379,0.07924
+8612399,M,18.46,18.52,121.1,1075,0.09874,0.1053,0.1335,0.08795,0.2132,0.06022,0.6997,1.475,4.782,80.6,0.006471,0.01649,0.02806,0.0142,0.0237,0.003755,22.93,27.68,152.2,1603,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579
+86135501,M,14.48,21.46,94.25,648.2,0.09444,0.09947,0.1204,0.04938,0.2075,0.05636,0.4204,2.22,3.301,38.87,0.009369,0.02983,0.05371,0.01761,0.02418,0.003249,16.21,29.25,108.4,808.9,0.1306,0.1976,0.3349,0.1225,0.302,0.06846
+86135502,M,19.02,24.59,122,1076,0.09029,0.1206,0.1468,0.08271,0.1953,0.05629,0.5495,0.6636,3.055,57.65,0.003872,0.01842,0.0371,0.012,0.01964,0.003337,24.56,30.41,152.9,1623,0.1249,0.3206,0.5755,0.1956,0.3956,0.09288
+861597,B,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,0.2978,1.502,2.203,20.95,0.007112,0.02493,0.02703,0.01293,0.01958,0.004463,13.83,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261
+861598,B,14.64,15.24,95.77,651.9,0.1132,0.1339,0.09966,0.07064,0.2116,0.06346,0.5115,0.7372,3.814,42.76,0.005508,0.04412,0.04436,0.01623,0.02427,0.004841,16.34,18.24,109.4,803.6,0.1277,0.3089,0.2604,0.1397,0.3151,0.08473
+861648,B,14.62,24.02,94.57,662.7,0.08974,0.08606,0.03102,0.02957,0.1685,0.05866,0.3721,1.111,2.279,33.76,0.004868,0.01818,0.01121,0.008606,0.02085,0.002893,16.11,29.11,102.9,803.7,0.1115,0.1766,0.09189,0.06946,0.2522,0.07246
+861799,M,15.37,22.76,100.2,728.2,0.092,0.1036,0.1122,0.07483,0.1717,0.06097,0.3129,0.8413,2.075,29.44,0.009882,0.02444,0.04531,0.01763,0.02471,0.002142,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
+861853,B,13.27,14.76,84.74,551.7,0.07355,0.05055,0.03261,0.02648,0.1386,0.05318,0.4057,1.153,2.701,36.35,0.004481,0.01038,0.01358,0.01082,0.01069,0.001435,16.36,22.35,104.5,830.6,0.1006,0.1238,0.135,0.1001,0.2027,0.06206
+862009,B,13.45,18.3,86.6,555.1,0.1022,0.08165,0.03974,0.0278,0.1638,0.0571,0.295,1.373,2.099,25.22,0.005884,0.01491,0.01872,0.009366,0.01884,0.001817,15.1,25.94,97.59,699.4,0.1339,0.1751,0.1381,0.07911,0.2678,0.06603
+862028,M,15.06,19.83,100.3,705.6,0.1039,0.1553,0.17,0.08815,0.1855,0.06284,0.4768,0.9644,3.706,47.14,0.00925,0.03715,0.04867,0.01851,0.01498,0.00352,18.23,24.23,123.5,1025,0.1551,0.4203,0.5203,0.2115,0.2834,0.08234
+86208,M,20.26,23.03,132.4,1264,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,0.7576,1.509,4.554,87.87,0.006016,0.03482,0.04232,0.01269,0.02657,0.004411,24.22,31.59,156.1,1750,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
+86211,B,12.18,17.84,77.79,451.1,0.1045,0.07057,0.0249,0.02941,0.19,0.06635,0.3661,1.511,2.41,24.44,0.005433,0.01179,0.01131,0.01519,0.0222,0.003408,12.83,20.92,82.14,495.2,0.114,0.09358,0.0498,0.05882,0.2227,0.07376
+862261,B,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,0.135,0.0689,0.335,2.043,2.132,20.05,0.01113,0.01463,0.005308,0.00525,0.01801,0.005667,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
+862485,B,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,0.06582,0.2315,0.5391,1.475,15.75,0.006153,0.0133,0.01693,0.006884,0.01651,0.002551,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756
+862548,M,14.42,19.77,94.48,642.5,0.09752,0.1141,0.09388,0.05839,0.1879,0.0639,0.2895,1.851,2.376,26.85,0.008005,0.02895,0.03321,0.01424,0.01462,0.004452,16.33,30.86,109.5,826.4,0.1431,0.3026,0.3194,0.1565,0.2718,0.09353
+862717,M,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,0.1609,0.05871,0.4565,1.29,2.861,43.14,0.005872,0.01488,0.02647,0.009921,0.01465,0.002355,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397
+862722,B,6.981,13.43,43.79,143.5,0.117,0.07568,0,0,0.193,0.07818,0.2241,1.508,1.553,9.833,0.01019,0.01084,0,0,0.02659,0.0041,7.93,19.54,50.41,185.2,0.1584,0.1202,0,0,0.2932,0.09382
+862965,B,12.18,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.0177,0.1739,0.05677,0.1924,1.571,1.183,14.68,0.00508,0.006098,0.01069,0.006797,0.01447,0.001532,13.34,32.84,84.58,547.8,0.1123,0.08862,0.1145,0.07431,0.2694,0.06878
+862980,B,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,0.1945,0.06322,0.1803,1.222,1.528,11.77,0.009058,0.02196,0.03029,0.01112,0.01609,0.00357,10.76,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849
+862989,B,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,0.2217,0.06481,0.355,1.534,2.302,23.13,0.007595,0.02219,0.0288,0.008614,0.0271,0.003451,11.54,23.31,74.22,402.8,0.1219,0.1486,0.07987,0.03203,0.2826,0.07552
+863030,M,13.11,15.56,87.21,530.2,0.1398,0.1765,0.2071,0.09601,0.1925,0.07692,0.3908,0.9238,2.41,34.66,0.007162,0.02912,0.05473,0.01388,0.01547,0.007098,16.31,22.4,106.4,827.2,0.1862,0.4099,0.6376,0.1986,0.3147,0.1405
+863031,B,11.64,18.33,75.17,412.5,0.1142,0.1017,0.0707,0.03485,0.1801,0.0652,0.306,1.657,2.155,20.62,0.00854,0.0231,0.02945,0.01398,0.01565,0.00384,13.14,29.26,85.51,521.7,0.1688,0.266,0.2873,0.1218,0.2806,0.09097
+863270,B,12.36,18.54,79.01,466.7,0.08477,0.06815,0.02643,0.01921,0.1602,0.06066,0.1199,0.8944,0.8484,9.227,0.003457,0.01047,0.01167,0.005558,0.01251,0.001356,13.29,27.49,85.56,544.1,0.1184,0.1963,0.1937,0.08442,0.2983,0.07185
+86355,M,22.27,19.67,152.8,1509,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,1.215,1.545,10.05,170,0.006515,0.08668,0.104,0.0248,0.03112,0.005037,28.4,28.01,206.8,2360,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
+864018,B,11.34,21.26,72.48,396.5,0.08759,0.06575,0.05133,0.01899,0.1487,0.06529,0.2344,0.9861,1.597,16.41,0.009113,0.01557,0.02443,0.006435,0.01568,0.002477,13.01,29.15,83.99,518.1,0.1699,0.2196,0.312,0.08278,0.2829,0.08832
+864033,B,9.777,16.99,62.5,290.2,0.1037,0.08404,0.04334,0.01778,0.1584,0.07065,0.403,1.424,2.747,22.87,0.01385,0.02932,0.02722,0.01023,0.03281,0.004638,11.05,21.47,71.68,367,0.1467,0.1765,0.13,0.05334,0.2533,0.08468
+86408,B,12.63,20.76,82.15,480.4,0.09933,0.1209,0.1065,0.06021,0.1735,0.0707,0.3424,1.803,2.711,20.48,0.01291,0.04042,0.05101,0.02295,0.02144,0.005891,13.33,25.47,89,527.4,0.1287,0.225,0.2216,0.1105,0.2226,0.08486
+86409,B,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,0.1704,0.07769,0.3628,1.49,3.399,29.25,0.005298,0.07446,0.1435,0.02292,0.02566,0.01298,15.3,23.73,107,709,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082
+864292,B,10.51,20.19,68.64,334.2,0.1122,0.1303,0.06476,0.03068,0.1922,0.07782,0.3336,1.86,2.041,19.91,0.01188,0.03747,0.04591,0.01544,0.02287,0.006792,11.16,22.75,72.62,374.4,0.13,0.2049,0.1295,0.06136,0.2383,0.09026
+864496,B,8.726,15.83,55.84,230.9,0.115,0.08201,0.04132,0.01924,0.1649,0.07633,0.1665,0.5864,1.354,8.966,0.008261,0.02213,0.03259,0.0104,0.01708,0.003806,9.628,19.62,64.48,284.4,0.1724,0.2364,0.2456,0.105,0.2926,0.1017
+864685,B,11.93,21.53,76.53,438.6,0.09768,0.07849,0.03328,0.02008,0.1688,0.06194,0.3118,0.9227,2,24.79,0.007803,0.02507,0.01835,0.007711,0.01278,0.003856,13.67,26.15,87.54,583,0.15,0.2399,0.1503,0.07247,0.2438,0.08541
+864726,B,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,0.3132,0.9789,3.28,16.94,0.01835,0.0676,0.09263,0.02308,0.02384,0.005601,9.414,17.07,63.34,270,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722
+864729,M,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,0.4266,0.9489,2.989,41.18,0.006985,0.02563,0.03011,0.01271,0.01602,0.003884,18.81,27.37,127.1,1095,0.1878,0.448,0.4704,0.2027,0.3585,0.1065
+864877,M,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,0.552,1.072,3.598,58.63,0.008699,0.03976,0.0595,0.0139,0.01495,0.005984,20.19,30.5,130.3,1272,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252
+865128,M,17.95,20.01,114.2,982,0.08402,0.06722,0.07293,0.05596,0.2129,0.05025,0.5506,1.214,3.357,54.04,0.004024,0.008422,0.02291,0.009863,0.05014,0.001902,20.58,27.83,129.2,1261,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111
+865137,B,11.41,10.82,73.34,403.3,0.09373,0.06685,0.03512,0.02623,0.1667,0.06113,0.1408,0.4607,1.103,10.5,0.00604,0.01529,0.01514,0.00646,0.01344,0.002206,12.82,15.97,83.74,510.5,0.1548,0.239,0.2102,0.08958,0.3016,0.08523
+86517,M,18.66,17.12,121.4,1077,0.1054,0.11,0.1457,0.08665,0.1966,0.06213,0.7128,1.581,4.895,90.47,0.008102,0.02101,0.03342,0.01601,0.02045,0.00457,22.25,24.9,145.4,1549,0.1503,0.2291,0.3272,0.1674,0.2894,0.08456
+865423,M,24.25,20.2,166.2,1761,0.1447,0.2867,0.4268,0.2012,0.2655,0.06877,1.509,3.12,9.807,233,0.02333,0.09806,0.1278,0.01822,0.04547,0.009875,26.02,23.99,180.9,2073,0.1696,0.4244,0.5803,0.2248,0.3222,0.08009
+865432,B,14.5,10.89,94.28,640.7,0.1101,0.1099,0.08842,0.05778,0.1856,0.06402,0.2929,0.857,1.928,24.19,0.003818,0.01276,0.02882,0.012,0.0191,0.002808,15.7,15.98,102.8,745.5,0.1313,0.1788,0.256,0.1221,0.2889,0.08006
+865468,B,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,0.1639,1.14,1.223,14.66,0.005919,0.0327,0.04957,0.01038,0.01208,0.004076,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628
+86561,B,13.85,17.21,88.44,588.7,0.08785,0.06136,0.0142,0.01141,0.1614,0.0589,0.2185,0.8561,1.495,17.91,0.004599,0.009169,0.009127,0.004814,0.01247,0.001708,15.49,23.58,100.3,725.9,0.1157,0.135,0.08115,0.05104,0.2364,0.07182
+866083,M,13.61,24.69,87.76,572.6,0.09258,0.07862,0.05285,0.03085,0.1761,0.0613,0.231,1.005,1.752,19.83,0.004088,0.01174,0.01796,0.00688,0.01323,0.001465,16.89,35.64,113.2,848.7,0.1471,0.2884,0.3796,0.1329,0.347,0.079
+866203,M,19,18.91,123.4,1138,0.08217,0.08028,0.09271,0.05627,0.1946,0.05044,0.6896,1.342,5.216,81.23,0.004428,0.02731,0.0404,0.01361,0.0203,0.002686,22.32,25.73,148.2,1538,0.1021,0.2264,0.3207,0.1218,0.2841,0.06541
+866458,B,15.1,16.39,99.58,674.5,0.115,0.1807,0.1138,0.08534,0.2001,0.06467,0.4309,1.068,2.796,39.84,0.009006,0.04185,0.03204,0.02258,0.02353,0.004984,16.11,18.33,105.9,762.6,0.1386,0.2883,0.196,0.1423,0.259,0.07779
+866674,M,19.79,25.12,130.4,1192,0.1015,0.1589,0.2545,0.1149,0.2202,0.06113,0.4953,1.199,2.765,63.33,0.005033,0.03179,0.04755,0.01043,0.01578,0.003224,22.63,33.58,148.7,1589,0.1275,0.3861,0.5673,0.1732,0.3305,0.08465
+866714,B,12.19,13.29,79.08,455.8,0.1066,0.09509,0.02855,0.02882,0.188,0.06471,0.2005,0.8163,1.973,15.24,0.006773,0.02456,0.01018,0.008094,0.02662,0.004143,13.34,17.81,91.38,545.2,0.1427,0.2585,0.09915,0.08187,0.3469,0.09241
+8670,M,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,0.7859,3.094,48.31,0.00624,0.01484,0.02813,0.01093,0.01397,0.002461,19.26,26,124.9,1156,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
+86730502,M,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,0.4332,1.265,2.844,43.68,0.004877,0.01952,0.02219,0.009231,0.01535,0.002373,19.47,31.68,129.7,1175,0.1395,0.3055,0.2992,0.1312,0.348,0.07619
+867387,B,15.71,13.93,102,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,0.3117,0.8155,1.972,27.94,0.005217,0.01515,0.01678,0.01268,0.01669,0.00233,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
+867739,M,18.45,21.91,120.2,1075,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,0.5959,1.202,3.766,68.35,0.006001,0.01422,0.02855,0.009148,0.01492,0.002205,22.52,31.39,145.6,1590,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761
+868202,M,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,0.06065,0.2367,1.38,1.457,19.87,0.007499,0.01202,0.02332,0.00892,0.01647,0.002629,14.49,33.37,92.04,653.6,0.1419,0.1523,0.2177,0.09331,0.2829,0.08067
+868223,B,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945,0.4489,2.508,3.258,34.37,0.006578,0.0138,0.02662,0.01307,0.01359,0.003707,13.33,25.48,86.16,546.7,0.1271,0.1028,0.1046,0.06968,0.1712,0.07343
+868682,B,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,0.1759,0.9938,1.143,12.67,0.005133,0.01521,0.01434,0.008602,0.01501,0.001588,12.32,22.02,79.93,462,0.119,0.1648,0.1399,0.08476,0.2676,0.06765
+868826,M,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,1.296,1.452,8.419,101.9,0.01,0.0348,0.06577,0.02801,0.05168,0.002887,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147
+868871,B,11.28,13.39,73,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
+868999,B,9.738,11.97,61.24,288.5,0.0925,0.04102,0,0,0.1903,0.06422,0.1988,0.496,1.218,12.26,0.00604,0.005656,0,0,0.02277,0.00322,10.62,14.1,66.53,342.9,0.1234,0.07204,0,0,0.3105,0.08151
+869104,M,16.11,18.05,105.1,813,0.09721,0.1137,0.09447,0.05943,0.1861,0.06248,0.7049,1.332,4.533,74.08,0.00677,0.01938,0.03067,0.01167,0.01875,0.003434,19.92,25.27,129,1233,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
+869218,B,11.43,17.31,73.66,398,0.1092,0.09486,0.02031,0.01861,0.1645,0.06562,0.2843,1.908,1.937,21.38,0.006664,0.01735,0.01158,0.00952,0.02282,0.003526,12.78,26.76,82.66,503,0.1413,0.1792,0.07708,0.06402,0.2584,0.08096
+869224,B,12.9,15.92,83.74,512.2,0.08677,0.09509,0.04894,0.03088,0.1778,0.06235,0.2143,0.7712,1.689,16.64,0.005324,0.01563,0.0151,0.007584,0.02104,0.001887,14.48,21.82,97.17,643.8,0.1312,0.2548,0.209,0.1012,0.3549,0.08118
+869254,B,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,0.2525,1.239,1.806,17.74,0.006547,0.01781,0.02018,0.005612,0.01671,0.00236,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
+869476,B,11.9,14.65,78.11,432.8,0.1152,0.1296,0.0371,0.03003,0.1995,0.07839,0.3962,0.6538,3.021,25.03,0.01017,0.04741,0.02789,0.0111,0.03127,0.009423,13.15,16.51,86.26,509.6,0.1424,0.2517,0.0942,0.06042,0.2727,0.1036
+869691,M,11.8,16.58,78.99,432,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,0.3197,1.426,2.281,24.72,0.005427,0.03633,0.04649,0.01843,0.05628,0.004635,13.74,26.38,91.93,591.7,0.1385,0.4092,0.4504,0.1865,0.5774,0.103
+86973701,B,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,0.1744,0.06493,0.422,1.909,3.271,39.43,0.00579,0.04877,0.05303,0.01527,0.03356,0.009368,16.25,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218
+86973702,B,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,0.1724,0.06081,0.2406,0.7394,2.12,21.2,0.005706,0.02297,0.03114,0.01493,0.01454,0.002528,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
+869931,B,13.74,17.91,88.12,585,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,0.25,0.7574,1.573,21.47,0.002838,0.01592,0.0178,0.005828,0.01329,0.001976,15.34,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014
+871001501,B,13,20.78,83.51,519.4,0.1135,0.07589,0.03136,0.02645,0.254,0.06087,0.4202,1.322,2.873,34.78,0.007017,0.01142,0.01949,0.01153,0.02951,0.001533,14.16,24.11,90.82,616.7,0.1297,0.1105,0.08112,0.06296,0.3196,0.06435
+871001502,B,8.219,20.7,53.27,203.9,0.09405,0.1305,0.1321,0.02168,0.2222,0.08261,0.1935,1.962,1.243,10.21,0.01243,0.05416,0.07753,0.01022,0.02309,0.01178,9.092,29.72,58.08,249.8,0.163,0.431,0.5381,0.07879,0.3322,0.1486
+8710441,B,9.731,15.34,63.78,300.2,0.1072,0.1599,0.4108,0.07857,0.2548,0.09296,0.8245,2.664,4.073,49.85,0.01097,0.09586,0.396,0.05279,0.03546,0.02984,11.02,19.49,71.04,380.5,0.1292,0.2772,0.8216,0.1571,0.3108,0.1259
+87106,B,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,0.2251,0.7815,1.429,15.48,0.009019,0.008985,0.01196,0.008232,0.02388,0.001619,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
+8711002,B,13.15,15.34,85.31,538.9,0.09384,0.08498,0.09293,0.03483,0.1822,0.06207,0.271,0.7927,1.819,22.79,0.008584,0.02017,0.03047,0.009536,0.02769,0.003479,14.77,20.5,97.67,677.3,0.1478,0.2256,0.3009,0.09722,0.3849,0.08633
+8711003,B,12.25,17.94,78.27,460.3,0.08654,0.06679,0.03885,0.02331,0.197,0.06228,0.22,0.9823,1.484,16.51,0.005518,0.01562,0.01994,0.007924,0.01799,0.002484,13.59,25.22,86.6,564.2,0.1217,0.1788,0.1943,0.08211,0.3113,0.08132
+8711202,M,17.68,20.74,117.4,963.7,0.1115,0.1665,0.1855,0.1054,0.1971,0.06166,0.8113,1.4,5.54,93.91,0.009037,0.04954,0.05206,0.01841,0.01778,0.004968,20.47,25.11,132.9,1302,0.1418,0.3498,0.3583,0.1515,0.2463,0.07738
+8711216,B,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,0.1844,0.05268,0.4789,2.06,3.479,46.61,0.003443,0.02661,0.03056,0.0111,0.0152,0.001519,18.22,28.07,120.3,1032,0.08774,0.171,0.1882,0.08436,0.2527,0.05972
+871122,B,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.05907,0.1822,0.7285,1.171,13.25,0.005528,0.009789,0.008342,0.006273,0.01465,0.00253,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898
+871149,B,10.9,12.96,68.69,366.8,0.07515,0.03718,0.00309,0.006588,0.1442,0.05743,0.2818,0.7614,1.808,18.54,0.006142,0.006134,0.001835,0.003576,0.01637,0.002665,12.36,18.2,78.07,470,0.1171,0.08294,0.01854,0.03953,0.2738,0.07685
+8711561,B,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,0.06453,0.5018,1.693,3.926,38.34,0.009433,0.02405,0.04167,0.01152,0.03397,0.005061,13.32,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987
+8711803,M,19.19,15.94,126.3,1157,0.08694,0.1185,0.1193,0.09667,0.1741,0.05176,1,0.6336,6.971,119.3,0.009406,0.03055,0.04344,0.02794,0.03156,0.003362,22.03,17.81,146.6,1495,0.1124,0.2016,0.2264,0.1777,0.2443,0.06251
+871201,M,19.59,18.15,130.7,1214,0.112,0.1666,0.2508,0.1286,0.2027,0.06082,0.7364,1.048,4.792,97.07,0.004057,0.02277,0.04029,0.01303,0.01686,0.003318,26.73,26.39,174.9,2232,0.1438,0.3846,0.681,0.2247,0.3643,0.09223
+8712064,B,12.34,22.22,79.85,464.5,0.1012,0.1015,0.0537,0.02822,0.1551,0.06761,0.2949,1.656,1.955,21.55,0.01134,0.03175,0.03125,0.01135,0.01879,0.005348,13.58,28.68,87.36,553,0.1452,0.2338,0.1688,0.08194,0.2268,0.09082
+8712289,M,23.27,22.04,152.1,1686,0.08439,0.1145,0.1324,0.09702,0.1801,0.05553,0.6642,0.8561,4.603,97.85,0.00491,0.02544,0.02822,0.01623,0.01956,0.00374,28.01,28.22,184.2,2403,0.1228,0.3583,0.3948,0.2346,0.3589,0.09187
+8712291,B,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,0.05266,0.184,1.065,1.286,16.64,0.003634,0.007983,0.008268,0.006432,0.01924,0.00152,15.98,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085
+87127,B,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,0.064,0.1728,0.4064,1.126,11.48,0.007809,0.009816,0.01099,0.005344,0.01254,0.00212,11.6,12.02,73.66,414,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
+8712729,M,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,0.599,1.391,4.129,67.34,0.006123,0.0247,0.02626,0.01604,0.02091,0.003493,20.05,26.3,130.7,1260,0.1168,0.2119,0.2318,0.1474,0.281,0.07228
+8712766,M,17.47,24.68,116.1,984.6,0.1049,0.1603,0.2159,0.1043,0.1538,0.06365,1.088,1.41,7.337,122.3,0.006174,0.03634,0.04644,0.01569,0.01145,0.00512,23.14,32.33,155.3,1660,0.1376,0.383,0.489,0.1721,0.216,0.093
+8712853,B,14.97,16.95,96.22,685.9,0.09855,0.07885,0.02602,0.03781,0.178,0.0565,0.2713,1.217,1.893,24.28,0.00508,0.0137,0.007276,0.009073,0.0135,0.001706,16.11,23,104.6,793.7,0.1216,0.1637,0.06648,0.08485,0.2404,0.06428
+87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,0.236,0.6656,1.67,17.43,0.008045,0.0118,0.01683,0.01241,0.01924,0.002248,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
+87163,M,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.147,3.142,43.4,0.006003,0.01063,0.02151,0.009443,0.0152,0.001868,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
+87164,M,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,0.4209,0.6583,2.805,44.64,0.005393,0.02321,0.04303,0.0132,0.01792,0.004168,18.79,17.04,125,1102,0.1531,0.3583,0.583,0.1827,0.3216,0.101
+871641,B,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,0.1566,0.06669,0.2073,1.805,1.377,19.08,0.01496,0.02121,0.01453,0.01583,0.03082,0.004785,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313
+871642,B,10.66,15.15,67.49,349.6,0.08792,0.04302,0,0,0.1928,0.05975,0.3309,1.925,2.155,21.98,0.008713,0.01017,0,0,0.03265,0.001002,11.54,19.2,73.2,408.3,0.1076,0.06791,0,0,0.271,0.06164
+872113,B,8.671,14.45,54.42,227.2,0.09138,0.04276,0,0,0.1722,0.06724,0.2204,0.7873,1.435,11.36,0.009172,0.008007,0,0,0.02711,0.003399,9.262,17.04,58.36,259.2,0.1162,0.07057,0,0,0.2592,0.07848
+872608,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,0.1669,0.08116,0.4311,2.261,3.132,27.48,0.01286,0.08808,0.1197,0.0246,0.0388,0.01792,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
+87281702,M,16.46,20.11,109.3,832.9,0.09831,0.1556,0.1793,0.08866,0.1794,0.06323,0.3037,1.284,2.482,31.59,0.006627,0.04094,0.05371,0.01813,0.01682,0.004584,17.79,28.45,123.5,981.2,0.1415,0.4667,0.5862,0.2035,0.3054,0.09519
+873357,B,13.01,22.22,82.01,526.4,0.06251,0.01938,0.001595,0.001852,0.1395,0.05234,0.1731,1.142,1.101,14.34,0.003418,0.002252,0.001595,0.001852,0.01613,0.0009683,14,29.02,88.18,608.8,0.08125,0.03432,0.007977,0.009259,0.2295,0.05843
+873586,B,12.81,13.06,81.29,508.8,0.08739,0.03774,0.009193,0.0133,0.1466,0.06133,0.2889,0.9899,1.778,21.79,0.008534,0.006364,0.00618,0.007408,0.01065,0.003351,13.63,16.15,86.7,570.7,0.1162,0.05445,0.02758,0.0399,0.1783,0.07319
+873592,M,27.22,21.87,182.1,2250,0.1094,0.1914,0.2871,0.1878,0.18,0.0577,0.8361,1.481,5.82,128.7,0.004631,0.02537,0.03109,0.01241,0.01575,0.002747,33.12,32.85,220.8,3216,0.1472,0.4034,0.534,0.2688,0.2856,0.08082
+873593,M,21.09,26.57,142.7,1311,0.1141,0.2832,0.2487,0.1496,0.2395,0.07398,0.6298,0.7629,4.414,81.46,0.004253,0.04759,0.03872,0.01567,0.01798,0.005295,26.68,33.48,176.5,2089,0.1491,0.7584,0.678,0.2903,0.4098,0.1284
+873701,M,15.7,20.31,101.2,766.6,0.09597,0.08799,0.06593,0.05189,0.1618,0.05549,0.3699,1.15,2.406,40.98,0.004626,0.02263,0.01954,0.009767,0.01547,0.00243,20.11,32.82,129.3,1269,0.1414,0.3547,0.2902,0.1541,0.3437,0.08631
+873843,B,11.41,14.92,73.53,402,0.09059,0.08155,0.06181,0.02361,0.1167,0.06217,0.3344,1.108,1.902,22.77,0.007356,0.03728,0.05915,0.01712,0.02165,0.004784,12.37,17.7,79.12,467.2,0.1121,0.161,0.1648,0.06296,0.1811,0.07427
+873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,0.2054,0.4956,1.344,19.53,0.00329,0.01395,0.01774,0.006009,0.01172,0.002575,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
+874158,B,10.08,15.11,63.76,317.5,0.09267,0.04695,0.001597,0.002404,0.1703,0.06048,0.4245,1.268,2.68,26.43,0.01439,0.012,0.001597,0.002404,0.02538,0.00347,11.87,21.18,75.39,437,0.1521,0.1019,0.00692,0.01042,0.2933,0.07697
+874217,M,18.31,18.58,118.6,1041,0.08588,0.08468,0.08169,0.05814,0.1621,0.05425,0.2577,0.4757,1.817,28.92,0.002866,0.009181,0.01412,0.006719,0.01069,0.001087,21.31,26.36,139.2,1410,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
+874373,B,11.71,17.19,74.68,420.3,0.09774,0.06141,0.03809,0.03239,0.1516,0.06095,0.2451,0.7655,1.742,17.86,0.006905,0.008704,0.01978,0.01185,0.01897,0.001671,13.01,21.39,84.42,521.5,0.1323,0.104,0.1521,0.1099,0.2572,0.07097
+874662,B,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,0.1718,0.0578,0.1859,1.926,1.011,14.47,0.007831,0.008776,0.01556,0.00624,0.03139,0.001988,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
+874839,B,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,0.1667,0.05474,0.2382,0.8355,1.687,18.32,0.005996,0.02212,0.02117,0.006433,0.02025,0.001725,13.35,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306
+874858,M,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,0.286,2.11,2.112,31.72,0.00797,0.1354,0.1166,0.01666,0.05113,0.01172,15.74,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446
+875093,B,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,0.7311,1.748,5.118,53.65,0.004571,0.0179,0.02176,0.01757,0.03373,0.005875,13.75,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871
+875099,B,9.72,18.22,60.73,288.1,0.0695,0.02344,0,0,0.1653,0.06447,0.3539,4.885,2.23,21.69,0.001713,0.006736,0,0,0.03799,0.001688,9.968,20.83,62.25,303.8,0.07117,0.02729,0,0,0.1909,0.06559
+875263,M,12.34,26.86,81.15,477.4,0.1034,0.1353,0.1085,0.04562,0.1943,0.06937,0.4053,1.809,2.642,34.44,0.009098,0.03845,0.03763,0.01321,0.01878,0.005672,15.65,39.34,101.7,768.9,0.1785,0.4706,0.4425,0.1459,0.3215,0.1205
+87556202,M,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,0.1737,0.06672,0.2796,0.9622,3.591,25.2,0.008081,0.05122,0.05551,0.01883,0.02545,0.004312,16.08,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701
+875878,B,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,0.1942,0.9086,1.493,15.75,0.005298,0.01587,0.02321,0.00842,0.01853,0.002152,13.88,22,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
+875938,M,13.77,22.29,90.63,588.9,0.12,0.1267,0.1385,0.06526,0.1834,0.06877,0.6191,2.112,4.906,49.7,0.0138,0.03348,0.04665,0.0206,0.02689,0.004306,16.39,34.01,111.6,806.9,0.1737,0.3122,0.3809,0.1673,0.308,0.09333
+877159,M,18.08,21.84,117.4,1024,0.07371,0.08642,0.1103,0.05778,0.177,0.0534,0.6362,1.305,4.312,76.36,0.00553,0.05296,0.0611,0.01444,0.0214,0.005036,19.76,24.7,129.1,1228,0.08822,0.1963,0.2535,0.09181,0.2369,0.06558
+877486,M,19.18,22.49,127.5,1148,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,0.4357,1.073,3.833,54.22,0.005524,0.03698,0.02706,0.01221,0.01415,0.003397,23.36,32.06,166.4,1688,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
+877500,M,14.45,20.22,94.49,642.7,0.09872,0.1206,0.118,0.0598,0.195,0.06466,0.2092,0.6509,1.446,19.42,0.004044,0.01597,0.02,0.007303,0.01522,0.001976,18.33,30.12,117.9,1044,0.1552,0.4056,0.4967,0.1838,0.4753,0.1013
+877501,B,12.23,19.56,78.54,461,0.09586,0.08087,0.04187,0.04107,0.1979,0.06013,0.3534,1.326,2.308,27.24,0.007514,0.01779,0.01401,0.0114,0.01503,0.003338,14.44,28.36,92.15,638.4,0.1429,0.2042,0.1377,0.108,0.2668,0.08174
+877989,M,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,0.1506,0.05491,0.3971,0.8282,3.088,40.73,0.00609,0.02569,0.02713,0.01345,0.01594,0.002658,20.42,25.84,139.5,1239,0.1381,0.342,0.3508,0.1939,0.2928,0.07867
+878796,M,23.29,26.67,158.9,1685,0.1141,0.2084,0.3523,0.162,0.22,0.06229,0.5539,1.56,4.667,83.16,0.009327,0.05121,0.08958,0.02465,0.02175,0.005195,25.12,32.68,177,1986,0.1536,0.4167,0.7892,0.2733,0.3198,0.08762
+87880,M,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,0.5648,1.93,3.909,52.72,0.008824,0.03108,0.03112,0.01291,0.01998,0.004506,19.2,41.85,128.5,1153,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086
+87930,B,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,0.3961,1.044,2.497,30.29,0.006953,0.01911,0.02701,0.01037,0.01782,0.003586,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
+879523,M,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,0.05986,0.2711,0.3621,1.974,26.44,0.005472,0.01919,0.02039,0.00826,0.01523,0.002881,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
+879804,B,9.876,17.27,62.92,295.4,0.1089,0.07232,0.01756,0.01952,0.1934,0.06285,0.2137,1.342,1.517,12.33,0.009719,0.01249,0.007975,0.007527,0.0221,0.002472,10.42,23.22,67.08,331.6,0.1415,0.1247,0.06213,0.05588,0.2989,0.0738
+879830,M,17.01,20.26,109.7,904.3,0.08772,0.07304,0.0695,0.0539,0.2026,0.05223,0.5858,0.8554,4.106,68.46,0.005038,0.01503,0.01946,0.01123,0.02294,0.002581,19.8,25.05,130,1210,0.1111,0.1486,0.1932,0.1096,0.3275,0.06469
+8810158,B,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,0.1931,0.9223,1.491,15.09,0.005251,0.03041,0.02526,0.008304,0.02514,0.004198,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076
+8810436,B,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,0.1359,0.05526,0.2134,0.3628,1.525,20,0.004291,0.01236,0.01841,0.007373,0.009539,0.001656,17.38,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474
+881046502,M,20.58,22.14,134.7,1290,0.0909,0.1348,0.164,0.09561,0.1765,0.05024,0.8601,1.48,7.029,111.7,0.008124,0.03611,0.05489,0.02765,0.03176,0.002365,23.24,27.84,158.3,1656,0.1178,0.292,0.3861,0.192,0.2909,0.05865
+8810528,B,11.84,18.94,75.51,428,0.08871,0.069,0.02669,0.01393,0.1533,0.06057,0.2222,0.8652,1.444,17.12,0.005517,0.01727,0.02045,0.006747,0.01616,0.002922,13.3,24.99,85.22,546.3,0.128,0.188,0.1471,0.06913,0.2535,0.07993
+8810703,M,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,2.873,1.476,21.98,525.6,0.01345,0.02772,0.06389,0.01407,0.04783,0.004476,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525
+881094802,M,17.42,25.56,114.5,948,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,0.5296,1.667,3.767,58.53,0.03113,0.08555,0.1438,0.03927,0.02175,0.01256,18.07,28.07,120.4,1021,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818
+8810955,M,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,0.4207,1.845,3.534,31,0.01088,0.0371,0.03688,0.01627,0.04499,0.004768,16.86,34.85,115,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026
+8810987,M,13.86,16.93,90.96,578.9,0.1026,0.1517,0.09901,0.05602,0.2106,0.06916,0.2563,1.194,1.933,22.69,0.00596,0.03438,0.03909,0.01435,0.01939,0.00456,15.75,26.93,104.4,750.1,0.146,0.437,0.4636,0.1654,0.363,0.1059
+8811523,B,11.89,18.35,77.32,432.2,0.09363,0.1154,0.06636,0.03142,0.1967,0.06314,0.2963,1.563,2.087,21.46,0.008872,0.04192,0.05946,0.01785,0.02793,0.004775,13.25,27.1,86.2,531.2,0.1405,0.3046,0.2806,0.1138,0.3397,0.08365
+8811779,B,10.2,17.48,65.05,321.2,0.08054,0.05907,0.05774,0.01071,0.1964,0.06315,0.3567,1.922,2.747,22.79,0.00468,0.0312,0.05774,0.01071,0.0256,0.004613,11.48,24.47,75.4,403.7,0.09527,0.1397,0.1925,0.03571,0.2868,0.07809
+8811842,M,19.8,21.56,129.7,1230,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,0.9553,1.186,6.487,124.4,0.006804,0.03169,0.03446,0.01712,0.01897,0.004045,25.73,28.64,170.3,2009,0.1353,0.3235,0.3617,0.182,0.307,0.08255
+88119002,M,19.53,32.47,128,1223,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,0.7392,1.321,4.722,109.9,0.005539,0.02644,0.02664,0.01078,0.01332,0.002256,27.9,45.41,180.2,2477,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568
+8812816,B,13.65,13.16,87.88,568.9,0.09646,0.08711,0.03888,0.02563,0.136,0.06344,0.2102,0.4336,1.391,17.4,0.004133,0.01695,0.01652,0.006659,0.01371,0.002735,15.34,16.35,99.71,706.2,0.1311,0.2474,0.1759,0.08056,0.238,0.08718
+8812818,B,13.56,13.9,88.59,561.3,0.1051,0.1192,0.0786,0.04451,0.1962,0.06303,0.2569,0.4981,2.011,21.03,0.005851,0.02314,0.02544,0.00836,0.01842,0.002918,14.98,17.13,101.1,686.6,0.1376,0.2698,0.2577,0.0909,0.3065,0.08177
+8812844,B,10.18,17.53,65.12,313.1,0.1061,0.08502,0.01768,0.01915,0.191,0.06908,0.2467,1.217,1.641,15.05,0.007899,0.014,0.008534,0.007624,0.02637,0.003761,11.17,22.84,71.94,375.6,0.1406,0.144,0.06572,0.05575,0.3055,0.08797
+8812877,M,15.75,20.25,102.6,761.3,0.1025,0.1204,0.1147,0.06462,0.1935,0.06303,0.3473,0.9209,2.244,32.19,0.004766,0.02374,0.02384,0.008637,0.01772,0.003131,19.56,30.29,125.9,1088,0.1552,0.448,0.3976,0.1479,0.3993,0.1064
+8813129,B,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,0.05674,0.2927,0.8907,2.044,24.68,0.006032,0.01104,0.02259,0.009057,0.01482,0.002496,15.14,23.6,98.84,708.8,0.1276,0.1311,0.1786,0.09678,0.2506,0.07623
+88143502,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,0.05448,0.522,0.8121,3.763,48.29,0.007089,0.01428,0.0236,0.01286,0.02266,0.001463,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
+88147101,B,10.44,15.46,66.62,329.6,0.1053,0.07722,0.006643,0.01216,0.1788,0.0645,0.1913,0.9027,1.208,11.86,0.006513,0.008061,0.002817,0.004972,0.01502,0.002821,11.52,19.8,73.47,395.4,0.1341,0.1153,0.02639,0.04464,0.2615,0.08269
+88147102,B,15,15.51,97.45,684.5,0.08371,0.1096,0.06505,0.0378,0.1881,0.05907,0.2318,0.4966,2.276,19.88,0.004119,0.03207,0.03644,0.01155,0.01391,0.003204,16.41,19.31,114.2,808.2,0.1136,0.3627,0.3402,0.1379,0.2954,0.08362
+88147202,B,12.62,23.97,81.35,496.4,0.07903,0.07529,0.05438,0.02036,0.1514,0.06019,0.2449,1.066,1.445,18.51,0.005169,0.02294,0.03016,0.008691,0.01365,0.003407,14.2,31.31,90.67,624,0.1227,0.3454,0.3911,0.118,0.2826,0.09585
+881861,M,12.83,22.33,85.26,503.2,0.1088,0.1799,0.1695,0.06861,0.2123,0.07254,0.3061,1.069,2.257,25.13,0.006983,0.03858,0.04683,0.01499,0.0168,0.005617,15.2,30.15,105.3,706,0.1777,0.5343,0.6282,0.1977,0.3407,0.1243
+881972,M,17.05,19.08,113.4,895,0.1141,0.1572,0.191,0.109,0.2131,0.06325,0.2959,0.679,2.153,31.98,0.005532,0.02008,0.03055,0.01384,0.01177,0.002336,19.59,24.89,133.5,1189,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061
+88199202,B,11.32,27.08,71.76,395.7,0.06883,0.03813,0.01633,0.003125,0.1869,0.05628,0.121,0.8927,1.059,8.605,0.003653,0.01647,0.01633,0.003125,0.01537,0.002052,12.08,33.75,79.82,452.3,0.09203,0.1432,0.1089,0.02083,0.2849,0.07087
+88203002,B,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,0.2239,1.647,1.489,15.46,0.004359,0.006813,0.003223,0.003419,0.01916,0.002534,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
+88206102,M,20.51,27.81,134.4,1319,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,0.524,1.189,3.767,70.01,0.00502,0.02062,0.03457,0.01091,0.01298,0.002887,24.47,37.38,162.7,1872,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328
+882488,B,9.567,15.91,60.21,279.6,0.08464,0.04087,0.01652,0.01667,0.1551,0.06403,0.2152,0.8301,1.215,12.64,0.01164,0.0104,0.01186,0.009623,0.02383,0.00354,10.51,19.16,65.74,335.9,0.1504,0.09515,0.07161,0.07222,0.2757,0.08178
+88249602,B,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,0.1517,0.05835,0.2589,1.503,1.667,22.07,0.007389,0.01383,0.007302,0.01004,0.01263,0.002925,15.33,30.28,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617
+88299702,M,23.21,26.97,153.5,1670,0.09509,0.1682,0.195,0.1237,0.1909,0.06309,1.058,0.9635,7.247,155.8,0.006428,0.02863,0.04497,0.01716,0.0159,0.003053,31.01,34.51,206,2944,0.1481,0.4126,0.582,0.2593,0.3103,0.08677
+883263,M,20.48,21.46,132.5,1306,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,0.6874,1.041,5.144,83.5,0.007959,0.03133,0.04257,0.01671,0.01341,0.003933,24.22,26.17,161.7,1750,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127
+883270,B,14.22,27.85,92.55,623.9,0.08223,0.1039,0.1103,0.04408,0.1342,0.06129,0.3354,2.324,2.105,29.96,0.006307,0.02845,0.0385,0.01011,0.01185,0.003589,15.75,40.54,102.5,764,0.1081,0.2426,0.3064,0.08219,0.189,0.07796
+88330202,M,17.46,39.28,113.4,920.6,0.09812,0.1298,0.1417,0.08811,0.1809,0.05966,0.5366,0.8561,3.002,49,0.00486,0.02785,0.02602,0.01374,0.01226,0.002759,22.51,44.87,141.2,1408,0.1365,0.3735,0.3241,0.2066,0.2853,0.08496
+88350402,B,13.64,15.6,87.38,575.3,0.09423,0.0663,0.04705,0.03731,0.1717,0.0566,0.3242,0.6612,1.996,27.19,0.00647,0.01248,0.0181,0.01103,0.01898,0.001794,14.85,19.05,94.11,683.4,0.1278,0.1291,0.1533,0.09222,0.253,0.0651
+883539,B,12.42,15.04,78.61,476.5,0.07926,0.03393,0.01053,0.01108,0.1546,0.05754,0.1153,0.6745,0.757,9.006,0.003265,0.00493,0.006493,0.003762,0.0172,0.00136,13.2,20.37,83.85,543.4,0.1037,0.07776,0.06243,0.04052,0.2901,0.06783
+883852,B,11.3,18.19,73.93,389.4,0.09592,0.1325,0.1548,0.02854,0.2054,0.07669,0.2428,1.642,2.369,16.39,0.006663,0.05914,0.0888,0.01314,0.01995,0.008675,12.58,27.96,87.16,472.9,0.1347,0.4848,0.7436,0.1218,0.3308,0.1297
+88411702,B,13.75,23.77,88.54,590,0.08043,0.06807,0.04697,0.02344,0.1773,0.05429,0.4347,1.057,2.829,39.93,0.004351,0.02667,0.03371,0.01007,0.02598,0.003087,15.01,26.34,98,706,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321
+884180,M,19.4,23.5,129.1,1155,0.1027,0.1558,0.2049,0.08886,0.1978,0.06,0.5243,1.802,4.037,60.41,0.01061,0.03252,0.03915,0.01559,0.02186,0.003949,21.65,30.53,144.9,1417,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
+884437,B,10.48,19.86,66.72,337.7,0.107,0.05971,0.04831,0.0307,0.1737,0.0644,0.3719,2.612,2.517,23.22,0.01604,0.01386,0.01865,0.01133,0.03476,0.00356,11.48,29.46,73.68,402.8,0.1515,0.1026,0.1181,0.06736,0.2883,0.07748
+884448,B,13.2,17.43,84.13,541.6,0.07215,0.04524,0.04336,0.01105,0.1487,0.05635,0.163,1.601,0.873,13.56,0.006261,0.01569,0.03079,0.005383,0.01962,0.00225,13.94,27.82,88.28,602,0.1101,0.1508,0.2298,0.0497,0.2767,0.07198
+884626,B,12.89,14.11,84.95,512.2,0.0876,0.1346,0.1374,0.0398,0.1596,0.06409,0.2025,0.4402,2.393,16.35,0.005501,0.05592,0.08158,0.0137,0.01266,0.007555,14.39,17.7,105,639.1,0.1254,0.5849,0.7727,0.1561,0.2639,0.1178
+88466802,B,10.65,25.22,68.01,347,0.09657,0.07234,0.02379,0.01615,0.1897,0.06329,0.2497,1.493,1.497,16.64,0.007189,0.01035,0.01081,0.006245,0.02158,0.002619,12.25,35.19,77.98,455.7,0.1499,0.1398,0.1125,0.06136,0.3409,0.08147
+884689,B,11.52,14.93,73.87,406.3,0.1013,0.07808,0.04328,0.02929,0.1883,0.06168,0.2562,1.038,1.686,18.62,0.006662,0.01228,0.02105,0.01006,0.01677,0.002784,12.65,21.19,80.88,491.8,0.1389,0.1582,0.1804,0.09608,0.2664,0.07809
+884948,M,20.94,23.56,138.9,1364,0.1007,0.1606,0.2712,0.131,0.2205,0.05898,1.004,0.8208,6.372,137.9,0.005283,0.03908,0.09518,0.01864,0.02401,0.005002,25.58,27,165.3,2010,0.1211,0.3172,0.6991,0.2105,0.3126,0.07849
+88518501,B,11.5,18.45,73.28,407.4,0.09345,0.05991,0.02638,0.02069,0.1834,0.05934,0.3927,0.8429,2.684,26.99,0.00638,0.01065,0.01245,0.009175,0.02292,0.001461,12.97,22.46,83.12,508.9,0.1183,0.1049,0.08105,0.06544,0.274,0.06487
+885429,M,19.73,19.82,130.7,1206,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,0.7661,0.78,4.115,92.81,0.008482,0.05057,0.068,0.01971,0.01467,0.007259,25.28,25.59,159.8,1933,0.171,0.5955,0.8489,0.2507,0.2749,0.1297
+8860702,M,17.3,17.08,113,928.2,0.1008,0.1041,0.1266,0.08353,0.1813,0.05613,0.3093,0.8568,2.193,33.63,0.004757,0.01503,0.02332,0.01262,0.01394,0.002362,19.85,25.09,130.9,1222,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113
+886226,M,19.45,19.33,126.5,1169,0.1035,0.1188,0.1379,0.08591,0.1776,0.05647,0.5959,0.6342,3.797,71,0.004649,0.018,0.02749,0.01267,0.01365,0.00255,25.7,24.57,163.1,1972,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895
+886452,M,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,0.1908,0.0613,0.425,0.8098,2.563,35.74,0.006351,0.02679,0.03119,0.01342,0.02062,0.002695,16.39,22.07,108.1,826,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
+88649001,M,19.55,28.77,133.6,1207,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,0.8426,1.199,7.158,106.4,0.006356,0.04765,0.03863,0.01519,0.01936,0.005252,25.05,36.27,178.6,1926,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005
+886776,M,15.32,17.27,103.2,713.3,0.1335,0.2284,0.2448,0.1242,0.2398,0.07596,0.6592,1.059,4.061,59.46,0.01015,0.04588,0.04983,0.02127,0.01884,0.00866,17.73,22.66,119.8,928.8,0.1765,0.4503,0.4429,0.2229,0.3258,0.1191
+887181,M,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,1.292,2.454,10.12,138.5,0.01236,0.05995,0.08232,0.03024,0.02337,0.006042,19.85,31.64,143.7,1226,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019
+88725602,M,15.53,33.56,103.7,744.9,0.1063,0.1639,0.1751,0.08399,0.2091,0.0665,0.2419,1.278,1.903,23.02,0.005345,0.02556,0.02889,0.01022,0.009947,0.003359,18.49,49.54,126.3,1035,0.1883,0.5564,0.5703,0.2014,0.3512,0.1204
+887549,M,20.31,27.06,132.9,1288,0.1,0.1088,0.1519,0.09333,0.1814,0.05572,0.3977,1.033,2.587,52.34,0.005043,0.01578,0.02117,0.008185,0.01282,0.001892,24.33,39.16,162.3,1844,0.1522,0.2945,0.3788,0.1697,0.3151,0.07999
+888264,M,17.35,23.06,111,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,0.4007,1.317,2.577,44.41,0.005726,0.01106,0.01246,0.007671,0.01411,0.001578,19.85,31.47,128.2,1218,0.124,0.1486,0.1211,0.08235,0.2452,0.06515
+888570,M,17.29,22.13,114.4,947.8,0.08999,0.1273,0.09697,0.07507,0.2108,0.05464,0.8348,1.633,6.146,90.94,0.006717,0.05981,0.04638,0.02149,0.02747,0.005838,20.39,27.24,137.9,1295,0.1134,0.2867,0.2298,0.1528,0.3067,0.07484
+889403,M,15.61,19.38,100,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,0.2298,0.9988,1.534,22.18,0.002826,0.009105,0.01311,0.005174,0.01013,0.001345,17.91,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829
+889719,M,17.19,22.07,111.6,928.3,0.09726,0.08995,0.09061,0.06527,0.1867,0.0558,0.4203,0.7383,2.819,45.42,0.004493,0.01206,0.02048,0.009875,0.01144,0.001575,21.58,29.33,140.5,1436,0.1558,0.2567,0.3889,0.1984,0.3216,0.0757
+88995002,M,20.73,31.12,135.7,1419,0.09469,0.1143,0.1367,0.08646,0.1769,0.05674,1.172,1.617,7.749,199.7,0.004551,0.01478,0.02143,0.00928,0.01367,0.002299,32.49,47.16,214,3432,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218
+8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,0.4505,1.197,3.43,27.1,0.00747,0.03581,0.03354,0.01365,0.03504,0.003318,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
+8910499,B,13.59,21.84,87.16,561,0.07956,0.08259,0.04072,0.02142,0.1635,0.05859,0.338,1.916,2.591,26.76,0.005436,0.02406,0.03099,0.009919,0.0203,0.003009,14.8,30.04,97.66,661.5,0.1005,0.173,0.1453,0.06189,0.2446,0.07024
+8910506,B,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,0.05769,0.2345,1.219,1.546,18.24,0.005518,0.02178,0.02589,0.00633,0.02593,0.002157,13.9,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062
+8910720,B,10.71,20.39,69.5,344.9,0.1082,0.1289,0.08448,0.02867,0.1668,0.06862,0.3198,1.489,2.23,20.74,0.008902,0.04785,0.07339,0.01745,0.02728,0.00761,11.69,25.21,76.51,410.4,0.1335,0.255,0.2534,0.086,0.2605,0.08701
+8910721,B,14.29,16.82,90.3,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,0.1302,0.7198,0.8439,10.77,0.003492,0.00371,0.004826,0.003608,0.01536,0.001381,14.91,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.0612
+8910748,B,11.29,13.04,72.23,388,0.09834,0.07608,0.03265,0.02755,0.1769,0.0627,0.1904,0.5293,1.164,13.17,0.006472,0.01122,0.01282,0.008849,0.01692,0.002817,12.32,16.18,78.27,457.5,0.1358,0.1507,0.1275,0.0875,0.2733,0.08022
+8910988,M,21.75,20.99,147.3,1491,0.09401,0.1961,0.2195,0.1088,0.1721,0.06194,1.167,1.352,8.867,156.8,0.005687,0.0496,0.06329,0.01561,0.01924,0.004614,28.19,28.18,195.9,2384,0.1272,0.4725,0.5807,0.1841,0.2833,0.08858
+8910996,B,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,0.2684,1.409,1.75,16.39,0.0138,0.01067,0.008347,0.009472,0.01798,0.004261,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175
+8911163,M,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,0.4212,1.433,2.765,45.81,0.005444,0.01169,0.01622,0.008522,0.01419,0.002751,20.92,34.69,135.1,1320,0.1315,0.1806,0.208,0.1136,0.2504,0.07948
+8911164,B,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,0.6412,2.293,4.021,48.84,0.01418,0.01489,0.01267,0.0191,0.02678,0.003002,12.4,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033
+8911230,B,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,0.1954,0.05821,0.2375,1.28,1.565,17.09,0.008426,0.008998,0.001487,0.003333,0.02358,0.001627,12.2,18.99,77.37,458,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
+8911670,M,18.81,19.98,120.9,1102,0.08923,0.05884,0.0802,0.05843,0.155,0.04996,0.3283,0.828,2.363,36.74,0.007571,0.01114,0.02623,0.01463,0.0193,0.001676,19.96,24.3,129,1236,0.1243,0.116,0.221,0.1294,0.2567,0.05737
+8911800,B,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,0.258,1.166,1.683,22.22,0.003741,0.005274,0.01065,0.005044,0.01344,0.001126,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
+8911834,B,13.85,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.03711,0.211,0.05853,0.2479,0.9195,1.83,19.41,0.004235,0.01541,0.01457,0.01043,0.01528,0.001593,14.98,21.74,98.37,670,0.1185,0.1724,0.1456,0.09993,0.2955,0.06912
+8912049,M,19.16,26.6,126.2,1138,0.102,0.1453,0.1921,0.09664,0.1902,0.0622,0.6361,1.001,4.321,69.65,0.007392,0.02449,0.03988,0.01293,0.01435,0.003446,23.72,35.9,159.8,1724,0.1782,0.3841,0.5754,0.1872,0.3258,0.0972
+8912055,B,11.74,14.02,74.24,427.3,0.07813,0.0434,0.02245,0.02763,0.2101,0.06113,0.5619,1.268,3.717,37.83,0.008034,0.01442,0.01514,0.01846,0.02921,0.002005,13.31,18.26,84.7,533.7,0.1036,0.085,0.06735,0.0829,0.3101,0.06688
+89122,M,19.4,18.18,127.2,1145,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,0.4709,0.9951,2.903,53.16,0.005654,0.02199,0.03059,0.01499,0.01623,0.001965,23.79,28.65,152.4,1628,0.1518,0.3749,0.4316,0.2252,0.359,0.07787
+8912280,M,16.24,18.77,108.8,805.1,0.1066,0.1802,0.1948,0.09052,0.1876,0.06684,0.2873,0.9173,2.464,28.09,0.004563,0.03481,0.03872,0.01209,0.01388,0.004081,18.55,25.09,126.9,1031,0.1365,0.4706,0.5026,0.1732,0.277,0.1063
+8912284,B,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,0.2913,1.389,2.347,23.29,0.006418,0.03961,0.07927,0.01774,0.01878,0.003696,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127
+8912521,B,12.58,18.4,79.83,489,0.08393,0.04216,0.00186,0.002924,0.1697,0.05855,0.2719,1.35,1.721,22.45,0.006383,0.008008,0.00186,0.002924,0.02571,0.002015,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
+8912909,B,11.94,20.76,77.87,441,0.08605,0.1011,0.06574,0.03791,0.1588,0.06766,0.2742,1.39,3.198,21.91,0.006719,0.05156,0.04387,0.01633,0.01872,0.008015,13.24,27.29,92.2,546.1,0.1116,0.2813,0.2365,0.1155,0.2465,0.09981
+8913,B,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,0.469,1.115,12.68,0.004731,0.01345,0.01652,0.005905,0.01619,0.002081,13.62,15.54,87.4,577,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915
+8913049,B,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,0.06233,0.4866,1.905,2.877,34.68,0.01574,0.08262,0.08099,0.03487,0.03418,0.006517,11.86,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009
+89143601,B,11.37,18.89,72.17,396,0.08713,0.05008,0.02399,0.02173,0.2013,0.05955,0.2656,1.974,1.954,17.49,0.006538,0.01395,0.01376,0.009924,0.03416,0.002928,12.36,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994
+89143602,B,14.41,19.73,96.03,651,0.08757,0.1676,0.1362,0.06602,0.1714,0.07192,0.8811,1.77,4.36,77.11,0.007762,0.1064,0.0996,0.02771,0.04077,0.02286,15.77,22.13,101.7,767.3,0.09983,0.2472,0.222,0.1021,0.2272,0.08799
+8915,B,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,0.948,2.171,24.87,0.005332,0.02115,0.01536,0.01187,0.01522,0.002815,16.25,26.19,109.1,809.8,0.1313,0.303,0.1804,0.1489,0.2962,0.08472
+891670,B,12.95,16.02,83.14,513.7,0.1005,0.07943,0.06155,0.0337,0.173,0.0647,0.2094,0.7636,1.231,17.67,0.008725,0.02003,0.02335,0.01132,0.02625,0.004726,13.74,19.93,88.81,585.4,0.1483,0.2068,0.2241,0.1056,0.338,0.09584
+891703,B,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,0.207,1.238,1.234,13.88,0.007595,0.015,0.01412,0.008578,0.01792,0.001784,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
+891716,B,12.72,13.78,81.78,492.1,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,0.1807,0.6931,1.34,13.38,0.006064,0.0118,0.006564,0.007978,0.01374,0.001392,13.5,17.48,88.54,553.7,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922
+891923,B,13.77,13.27,88.06,582.7,0.09198,0.06221,0.01063,0.01917,0.1592,0.05912,0.2191,0.6946,1.479,17.74,0.004348,0.008153,0.004272,0.006829,0.02154,0.001802,14.67,16.93,94.17,661.1,0.117,0.1072,0.03732,0.05802,0.2823,0.06794
+891936,B,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,0.1449,0.06031,0.1753,1.027,1.267,11.09,0.003478,0.01221,0.01072,0.009393,0.02941,0.003428,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
+892189,M,11.76,18.14,75,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,0.645,2.105,4.138,49.11,0.005596,0.01005,0.01272,0.01432,0.01575,0.002758,13.36,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915
+892214,B,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,0.23,0.669,1.661,20.56,0.003169,0.01377,0.01079,0.005243,0.01103,0.001957,16.22,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676
+892399,B,10.51,23.09,66.85,334.2,0.1015,0.06797,0.02495,0.01875,0.1695,0.06556,0.2868,1.143,2.289,20.56,0.01017,0.01443,0.01861,0.0125,0.03464,0.001971,10.93,24.22,70.1,362.7,0.1143,0.08614,0.04158,0.03125,0.2227,0.06777
+892438,M,19.53,18.9,129.5,1217,0.115,0.1642,0.2197,0.1062,0.1792,0.06552,1.111,1.161,7.237,133,0.006056,0.03203,0.05638,0.01733,0.01884,0.004787,25.93,26.24,171.1,2053,0.1495,0.4116,0.6121,0.198,0.2968,0.09929
+892604,B,12.46,19.89,80.43,471.3,0.08451,0.1014,0.0683,0.03099,0.1781,0.06249,0.3642,1.04,2.579,28.32,0.00653,0.03369,0.04712,0.01403,0.0274,0.004651,13.46,23.07,88.13,551.3,0.105,0.2158,0.1904,0.07625,0.2685,0.07764
+89263202,M,20.09,23.86,134.7,1247,0.108,0.1838,0.2283,0.128,0.2249,0.07469,1.072,1.743,7.804,130.8,0.007964,0.04732,0.07649,0.01936,0.02736,0.005928,23.68,29.43,158.8,1696,0.1347,0.3391,0.4932,0.1923,0.3294,0.09469
+892657,B,10.49,18.61,66.86,334.3,0.1068,0.06678,0.02297,0.0178,0.1482,0.066,0.1485,1.563,1.035,10.08,0.008875,0.009362,0.01808,0.009199,0.01791,0.003317,11.06,24.54,70.76,375.4,0.1413,0.1044,0.08423,0.06528,0.2213,0.07842
+89296,B,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,0.06243,0.3278,1.059,2.475,22.93,0.006652,0.02652,0.02221,0.007807,0.01894,0.003411,12.68,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638
+893061,B,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,0.1935,0.05878,0.2512,1.786,1.961,18.21,0.006122,0.02337,0.01596,0.006998,0.03194,0.002211,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745
+89344,B,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,0.1903,0.5735,1.204,15.5,0.003632,0.007861,0.001128,0.002386,0.01344,0.002585,14.41,20.45,92,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385
+89346,B,9,14.4,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,0.1746,1.305,1.144,9.789,0.007389,0.004883,0.003681,0.003472,0.02701,0.002153,9.699,20.07,60.9,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804
+893526,B,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,0.2244,0.6864,1.509,20.39,0.003338,0.003746,0.00203,0.003242,0.0148,0.001566,14.97,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192
+893548,B,13.05,13.84,82.71,530.6,0.08352,0.03735,0.004559,0.008829,0.1453,0.05518,0.3975,0.8285,2.567,33.01,0.004148,0.004711,0.002831,0.004821,0.01422,0.002273,14.73,17.4,93.96,672.4,0.1016,0.05847,0.01824,0.03532,0.2107,0.0658
+893783,B,11.7,19.11,74.33,418.7,0.08814,0.05253,0.01583,0.01148,0.1936,0.06128,0.1601,1.43,1.109,11.28,0.006064,0.00911,0.01042,0.007638,0.02349,0.001661,12.61,26.55,80.92,483.1,0.1223,0.1087,0.07915,0.05741,0.3487,0.06958
+89382601,B,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,0.1632,0.05255,0.316,0.9115,1.954,28.9,0.005031,0.006021,0.005325,0.006324,0.01494,0.0008948,16.46,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695
+89382602,B,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.02548,0.1601,0.0614,0.3265,0.6594,2.346,25.18,0.006494,0.02768,0.03137,0.01069,0.01731,0.004392,14.19,16.4,92.04,618.8,0.1194,0.2208,0.1769,0.08411,0.2564,0.08253
+893988,B,11.54,10.72,73.73,409.1,0.08597,0.05969,0.01367,0.008907,0.1833,0.061,0.1312,0.3602,1.107,9.438,0.004124,0.0134,0.01003,0.004667,0.02032,0.001952,12.34,12.87,81.23,467.8,0.1092,0.1626,0.08324,0.04715,0.339,0.07434
+894047,B,8.597,18.6,54.09,221.2,0.1074,0.05847,0,0,0.2163,0.07359,0.3368,2.777,2.222,17.81,0.02075,0.01403,0,0,0.06146,0.00682,8.952,22.44,56.65,240.1,0.1347,0.07767,0,0,0.3142,0.08116
+894089,B,12.49,16.85,79.19,481.6,0.08511,0.03834,0.004473,0.006423,0.1215,0.05673,0.1716,0.7151,1.047,12.69,0.004928,0.003012,0.00262,0.00339,0.01393,0.001344,13.34,19.71,84.48,544.2,0.1104,0.04953,0.01938,0.02784,0.1917,0.06174
+894090,B,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,0.2113,0.5996,1.438,15.82,0.005343,0.005767,0.01123,0.005051,0.01977,0.0009502,12.85,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037
+894326,M,18.22,18.87,118.7,1027,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,0.4041,0.5503,2.547,48.9,0.004821,0.01659,0.02408,0.01143,0.01275,0.002451,21.84,25,140.9,1485,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
+894329,B,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055
+894335,B,12.43,17,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,0.3778,2.2,2.487,31.16,0.007357,0.01079,0.009959,0.0112,0.03433,0.002961,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
+894604,B,10.25,16.18,66.52,324.2,0.1061,0.1111,0.06726,0.03965,0.1743,0.07279,0.3677,1.471,1.597,22.68,0.01049,0.04265,0.04004,0.01544,0.02719,0.007596,11.28,20.61,71.53,390.4,0.1402,0.236,0.1898,0.09744,0.2608,0.09702
+894618,M,20.16,19.66,131.1,1274,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933
+894855,B,12.86,13.32,82.82,504.8,0.1134,0.08834,0.038,0.034,0.1543,0.06476,0.2212,1.042,1.614,16.57,0.00591,0.02016,0.01902,0.01011,0.01202,0.003107,14.04,21.08,92.8,599.5,0.1547,0.2231,0.1791,0.1155,0.2382,0.08553
+895100,M,20.34,21.51,135.9,1264,0.117,0.1875,0.2565,0.1504,0.2569,0.0667,0.5702,1.023,4.012,69.06,0.005485,0.02431,0.0319,0.01369,0.02768,0.003345,25.3,31.86,171.1,1938,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
+89511501,B,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,0.2575,0.8073,1.959,19.01,0.005403,0.01418,0.01051,0.005142,0.01333,0.002065,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
+89511502,B,12.67,17.3,81.25,489.9,0.1028,0.07664,0.03193,0.02107,0.1707,0.05984,0.21,0.9505,1.566,17.61,0.006809,0.009514,0.01329,0.006474,0.02057,0.001784,13.71,21.1,88.7,574.4,0.1384,0.1212,0.102,0.05602,0.2688,0.06888
+89524,B,14.11,12.88,90.03,616.5,0.09309,0.05306,0.01765,0.02733,0.1373,0.057,0.2571,1.081,1.558,23.92,0.006692,0.01132,0.005717,0.006627,0.01416,0.002476,15.53,18,98.4,749.9,0.1281,0.1109,0.05307,0.0589,0.21,0.07083
+895299,B,12.03,17.93,76.09,446,0.07683,0.03892,0.001546,0.005592,0.1382,0.0607,0.2335,0.9097,1.466,16.97,0.004729,0.006887,0.001184,0.003951,0.01466,0.001755,13.07,22.25,82.74,523.4,0.1013,0.0739,0.007732,0.02796,0.2171,0.07037
+8953902,M,16.27,20.71,106.9,813.7,0.1169,0.1319,0.1478,0.08488,0.1948,0.06277,0.4375,1.232,3.27,44.41,0.006697,0.02083,0.03248,0.01392,0.01536,0.002789,19.28,30.38,129.8,1121,0.159,0.2947,0.3597,0.1583,0.3103,0.082
+895633,M,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,0.5706,1.457,2.961,57.72,0.01056,0.03756,0.05839,0.01186,0.04022,0.006187,17.73,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953
+896839,M,16.03,15.51,105.8,793.2,0.09491,0.1371,0.1204,0.07041,0.1782,0.05976,0.3371,0.7476,2.629,33.27,0.005839,0.03245,0.03715,0.01459,0.01467,0.003121,18.76,21.98,124.3,1070,0.1435,0.4478,0.4956,0.1981,0.3019,0.09124
+896864,B,12.98,19.35,84.52,514,0.09579,0.1125,0.07107,0.0295,0.1761,0.0654,0.2684,0.5664,2.465,20.65,0.005727,0.03255,0.04393,0.009811,0.02751,0.004572,14.42,21.95,99.21,634.3,0.1288,0.3253,0.3439,0.09858,0.3596,0.09166
+897132,B,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,0.2976,1.966,1.959,19.62,0.01289,0.01104,0.003297,0.004967,0.04243,0.001963,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
+897137,B,11.25,14.78,71.38,390,0.08306,0.04458,0.0009737,0.002941,0.1773,0.06081,0.2144,0.9961,1.529,15.07,0.005617,0.007124,0.0009737,0.002941,0.017,0.00203,12.76,22.06,82.08,492.7,0.1166,0.09794,0.005518,0.01667,0.2815,0.07418
+897374,B,12.3,19.02,77.88,464.4,0.08313,0.04202,0.007756,0.008535,0.1539,0.05945,0.184,1.532,1.199,13.24,0.007881,0.008432,0.007004,0.006522,0.01939,0.002222,13.35,28.46,84.53,544.3,0.1222,0.09052,0.03619,0.03983,0.2554,0.07207
+89742801,M,17.06,21,111.8,918.6,0.1119,0.1056,0.1508,0.09934,0.1727,0.06071,0.8161,2.129,6.076,87.17,0.006455,0.01797,0.04502,0.01744,0.01829,0.003733,20.99,33.15,143.2,1362,0.1449,0.2053,0.392,0.1827,0.2623,0.07599
+897604,B,12.99,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.02098,0.1652,0.07238,0.1814,0.6412,0.9219,14.41,0.005231,0.02305,0.03113,0.007315,0.01639,0.005701,13.72,16.91,87.38,576,0.1142,0.1975,0.145,0.0585,0.2432,0.1009
+897630,M,18.77,21.43,122.9,1092,0.09116,0.1402,0.106,0.0609,0.1953,0.06083,0.6422,1.53,4.369,88.25,0.007548,0.03897,0.03914,0.01816,0.02168,0.004445,24.54,34.37,161.1,1873,0.1498,0.4827,0.4634,0.2048,0.3679,0.0987
+897880,B,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664
+89812,M,23.51,24.27,155.1,1747,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,1.009,0.9245,6.462,164.1,0.006292,0.01971,0.03582,0.01301,0.01479,0.003118,30.67,30.73,202.4,2906,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738
+89813,B,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,0.1912,0.06412,0.3491,0.7706,2.677,32.14,0.004577,0.03053,0.0384,0.01243,0.01873,0.003373,16.67,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764
+898143,B,9.606,16.84,61.64,280.5,0.08481,0.09228,0.08422,0.02292,0.2036,0.07125,0.1844,0.9429,1.429,12.07,0.005954,0.03471,0.05028,0.00851,0.0175,0.004031,10.75,23.07,71.25,353.6,0.1233,0.3416,0.4341,0.0812,0.2982,0.09825
+89827,B,11.06,14.96,71.49,373.9,0.1033,0.09097,0.05397,0.03341,0.1776,0.06907,0.1601,0.8225,1.355,10.8,0.007416,0.01877,0.02758,0.0101,0.02348,0.002917,11.92,19.9,79.76,440,0.1418,0.221,0.2299,0.1075,0.3301,0.0908
+898431,M,19.68,21.68,129.9,1194,0.09797,0.1339,0.1863,0.1103,0.2082,0.05715,0.6226,2.284,5.173,67.66,0.004756,0.03368,0.04345,0.01806,0.03756,0.003288,22.75,34.66,157.6,1540,0.1218,0.3458,0.4734,0.2255,0.4045,0.07918
+89864002,B,11.71,15.45,75.03,420.3,0.115,0.07281,0.04006,0.0325,0.2009,0.06506,0.3446,0.7395,2.355,24.53,0.009536,0.01097,0.01651,0.01121,0.01953,0.0031,13.06,18.16,84.16,516.4,0.146,0.1115,0.1087,0.07864,0.2765,0.07806
+898677,B,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,0.1633,0.07005,0.338,2.509,2.394,19.33,0.01736,0.04671,0.02611,0.01296,0.03675,0.006758,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
+898678,B,12.06,18.9,76.66,445.3,0.08386,0.05794,0.00751,0.008488,0.1555,0.06048,0.243,1.152,1.559,18.02,0.00718,0.01096,0.005832,0.005495,0.01982,0.002754,13.64,27.06,86.54,562.6,0.1289,0.1352,0.04506,0.05093,0.288,0.08083
+89869,B,14.76,14.74,94.87,668.7,0.08875,0.0778,0.04608,0.03528,0.1521,0.05912,0.3428,0.3981,2.537,29.06,0.004732,0.01506,0.01855,0.01067,0.02163,0.002783,17.27,17.93,114.2,880.8,0.122,0.2009,0.2151,0.1251,0.3109,0.08187
+898690,B,11.47,16.03,73.02,402.7,0.09076,0.05886,0.02587,0.02322,0.1634,0.06372,0.1707,0.7615,1.09,12.25,0.009191,0.008548,0.0094,0.006315,0.01755,0.003009,12.51,20.79,79.67,475.8,0.1531,0.112,0.09823,0.06548,0.2851,0.08763
+899147,B,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,0.361,1.05,2.455,26.65,0.0058,0.02417,0.007816,0.01052,0.02734,0.003114,12.81,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759
+899187,B,11.66,17.07,73.7,421,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,0.3534,0.6724,2.225,26.03,0.006583,0.006991,0.005949,0.006296,0.02216,0.002668,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
+899667,M,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.22,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.36,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105
+899987,M,25.73,17.46,174.2,2010,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,0.9948,0.8509,7.222,153.1,0.006369,0.04243,0.04266,0.01508,0.02335,0.003385,33.13,23.58,229.3,3234,0.153,0.5937,0.6451,0.2756,0.369,0.08815
+9010018,M,15.08,25.74,98,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,0.6534,1.506,4.174,63.37,0.01052,0.02431,0.04912,0.01746,0.0212,0.004867,18.51,33.22,121.2,1050,0.166,0.2356,0.4029,0.1526,0.2654,0.09438
+901011,B,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.169,0.06083,0.4222,0.8092,3.33,28.84,0.005541,0.03387,0.04505,0.01471,0.03102,0.004831,12.12,15.82,79.62,453.5,0.08864,0.1256,0.1201,0.03922,0.2576,0.07018
+9010258,B,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,0.3602,1.478,3.212,27.49,0.009853,0.04235,0.06271,0.01966,0.02639,0.004205,13.37,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188
+9010259,B,13.05,18.59,85.09,512,0.1082,0.1304,0.09603,0.05603,0.2035,0.06501,0.3106,1.51,2.59,21.57,0.007807,0.03932,0.05112,0.01876,0.0286,0.005715,14.19,24.85,94.22,591.2,0.1343,0.2658,0.2573,0.1258,0.3113,0.08317
+901028,B,13.87,16.21,88.52,593.7,0.08743,0.05492,0.01502,0.02088,0.1424,0.05883,0.2543,1.363,1.737,20.74,0.005638,0.007939,0.005254,0.006042,0.01544,0.002087,15.11,25.58,96.74,694.4,0.1153,0.1008,0.05285,0.05556,0.2362,0.07113
+9010333,B,8.878,15.49,56.74,241,0.08293,0.07698,0.04721,0.02381,0.193,0.06621,0.5381,1.2,4.277,30.18,0.01093,0.02899,0.03214,0.01506,0.02837,0.004174,9.981,17.7,65.27,302,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431
+901034301,B,9.436,18.32,59.82,278.6,0.1009,0.05956,0.0271,0.01406,0.1506,0.06959,0.5079,1.247,3.267,30.48,0.006836,0.008982,0.02348,0.006565,0.01942,0.002713,12.02,25.02,75.79,439.6,0.1333,0.1049,0.1144,0.05052,0.2454,0.08136
+901034302,B,12.54,18.07,79.42,491.9,0.07436,0.0265,0.001194,0.005449,0.1528,0.05185,0.3511,0.9527,2.329,28.3,0.005783,0.004693,0.0007929,0.003617,0.02043,0.001058,13.72,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521
+901041,B,13.3,21.57,85.24,546.1,0.08582,0.06373,0.03344,0.02424,0.1815,0.05696,0.2621,1.539,2.028,20.98,0.005498,0.02045,0.01795,0.006399,0.01829,0.001956,14.2,29.2,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658
+9010598,B,12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,0.06183,0.2213,1.285,1.535,17.26,0.005608,0.01646,0.01529,0.009997,0.01909,0.002133,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238
+9010872,B,16.5,18.29,106.6,838.1,0.09686,0.08468,0.05862,0.04835,0.1495,0.05593,0.3389,1.439,2.344,33.58,0.007257,0.01805,0.01832,0.01033,0.01694,0.002001,18.13,25.45,117.2,1009,0.1338,0.1679,0.1663,0.09123,0.2394,0.06469
+9010877,B,13.4,16.95,85.48,552.4,0.07937,0.05696,0.02181,0.01473,0.165,0.05701,0.1584,0.6124,1.036,13.22,0.004394,0.0125,0.01451,0.005484,0.01291,0.002074,14.73,21.7,93.76,663.5,0.1213,0.1676,0.1364,0.06987,0.2741,0.07582
+901088,M,20.44,21.78,133.8,1293,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,0.5781,0.9168,4.218,72.44,0.006208,0.01906,0.02375,0.01461,0.01445,0.001906,24.31,26.37,161.2,1780,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735
+9011494,M,20.2,26.83,133.7,1234,0.09905,0.1669,0.1641,0.1265,0.1875,0.0602,0.9761,1.892,7.128,103.6,0.008439,0.04674,0.05904,0.02536,0.0371,0.004286,24.19,33.81,160,1671,0.1278,0.3416,0.3703,0.2152,0.3271,0.07632
+9011495,B,12.21,18.02,78.31,458.4,0.09231,0.07175,0.04392,0.02027,0.1695,0.05916,0.2527,0.7786,1.874,18.57,0.005833,0.01388,0.02,0.007087,0.01938,0.00196,14.29,24.04,93.85,624.6,0.1368,0.217,0.2413,0.08829,0.3218,0.0747
+9011971,M,21.71,17.25,140.9,1546,0.09384,0.08562,0.1168,0.08465,0.1717,0.05054,1.207,1.051,7.733,224.1,0.005568,0.01112,0.02096,0.01197,0.01263,0.001803,30.75,26.44,199.5,3143,0.1363,0.1628,0.2861,0.182,0.251,0.06494
+9012000,M,22.01,21.9,147.2,1482,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,1.008,0.6999,7.561,130.2,0.003978,0.02821,0.03576,0.01471,0.01518,0.003796,27.66,25.8,195,2227,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574
+9012315,M,16.35,23.29,109,840.4,0.09742,0.1497,0.1811,0.08773,0.2175,0.06218,0.4312,1.022,2.972,45.5,0.005635,0.03917,0.06072,0.01656,0.03197,0.004085,19.38,31.03,129.3,1165,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614
+9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,0.1783,0.4125,1.338,17.72,0.005012,0.01485,0.01551,0.009155,0.01647,0.001767,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
+9012795,M,21.37,15.1,141.3,1386,0.1001,0.1515,0.1932,0.1255,0.1973,0.06183,0.3414,1.309,2.407,39.06,0.004426,0.02675,0.03437,0.01343,0.01675,0.004367,22.69,21.84,152.1,1535,0.1192,0.284,0.4024,0.1966,0.273,0.08666
+901288,M,20.64,17.35,134.8,1335,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,0.6137,0.6575,4.119,77.02,0.006211,0.01895,0.02681,0.01232,0.01276,0.001711,25.37,23.17,166.8,1946,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055
+9013005,B,13.69,16.07,87.84,579.1,0.08302,0.06374,0.02556,0.02031,0.1872,0.05669,0.1705,0.5066,1.372,14,0.00423,0.01587,0.01169,0.006335,0.01943,0.002177,14.84,20.21,99.16,670.6,0.1105,0.2096,0.1346,0.06987,0.3323,0.07701
+901303,B,16.17,16.07,106.3,788.5,0.0988,0.1438,0.06651,0.05397,0.199,0.06572,0.1745,0.489,1.349,14.91,0.00451,0.01812,0.01951,0.01196,0.01934,0.003696,16.97,19.14,113.1,861.5,0.1235,0.255,0.2114,0.1251,0.3153,0.0896
+901315,B,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,0.1115,1.231,2.363,7.228,0.008499,0.07643,0.1535,0.02919,0.01617,0.0122,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12
+9013579,B,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,0.1421,0.05763,0.1689,1.15,1.4,14.91,0.004942,0.01203,0.007508,0.005179,0.01442,0.001684,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061
+9013594,B,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,0.1402,0.5417,1.101,11.35,0.005212,0.02984,0.02443,0.008356,0.01818,0.004868,14.54,19.64,97.96,657,0.1275,0.3104,0.2569,0.1054,0.3387,0.09638
+9013838,M,11.08,18.83,73.3,361.6,0.1216,0.2154,0.1689,0.06367,0.2196,0.0795,0.2114,1.027,1.719,13.99,0.007405,0.04549,0.04588,0.01339,0.01738,0.004435,13.24,32.82,91.76,508.1,0.2184,0.9379,0.8402,0.2524,0.4154,0.1403
+901549,B,11.27,12.96,73.16,386.3,0.1237,0.1111,0.079,0.0555,0.2018,0.06914,0.2562,0.9858,1.809,16.04,0.006635,0.01777,0.02101,0.01164,0.02108,0.003721,12.84,20.53,84.93,476.1,0.161,0.2429,0.2247,0.1318,0.3343,0.09215
+901836,B,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246,0.1642,1.031,1.281,11.68,0.005296,0.01903,0.01723,0.00696,0.0188,0.001941,12.09,20.83,79.73,447.1,0.1095,0.1982,0.1553,0.06754,0.3202,0.07287
+90250,B,12.05,22.72,78.75,447.8,0.06935,0.1073,0.07943,0.02978,0.1203,0.06659,0.1194,1.434,1.778,9.549,0.005042,0.0456,0.04305,0.01667,0.0247,0.007358,12.57,28.71,87.36,488.4,0.08799,0.3214,0.2912,0.1092,0.2191,0.09349
+90251,B,12.39,17.48,80.64,462.9,0.1042,0.1297,0.05892,0.0288,0.1779,0.06588,0.2608,0.873,2.117,19.2,0.006715,0.03705,0.04757,0.01051,0.01838,0.006884,14.18,23.13,95.23,600.5,0.1427,0.3593,0.3206,0.09804,0.2819,0.1118
+902727,B,13.28,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,0.1833,0.5308,1.592,15.26,0.004271,0.02073,0.02828,0.008468,0.01461,0.002613,14.24,17.37,96.59,623.7,0.1166,0.2685,0.2866,0.09173,0.2736,0.0732
+90291,M,14.6,23.29,93.97,664.7,0.08682,0.06636,0.0839,0.05271,0.1627,0.05416,0.4157,1.627,2.914,33.01,0.008312,0.01742,0.03389,0.01576,0.0174,0.002871,15.79,31.71,102.2,758.2,0.1312,0.1581,0.2675,0.1359,0.2477,0.06836
+902975,B,12.21,14.09,78.78,462,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,0.2666,0.8309,2.097,19.96,0.004405,0.03026,0.04344,0.01087,0.01921,0.004622,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
+902976,B,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,0.05474,0.2541,0.6218,1.709,23.12,0.003728,0.01415,0.01988,0.007016,0.01647,0.00197,15.51,19.97,99.66,745.3,0.08484,0.1233,0.1091,0.04537,0.2542,0.06623
+903011,B,11.27,15.5,73.38,392,0.08365,0.1114,0.1007,0.02757,0.181,0.07252,0.3305,1.067,2.569,22.97,0.01038,0.06669,0.09472,0.02047,0.01219,0.01233,12.04,18.93,79.73,450,0.1102,0.2809,0.3021,0.08272,0.2157,0.1043
+90312,M,19.55,23.21,128.9,1174,0.101,0.1318,0.1856,0.1021,0.1989,0.05884,0.6107,2.836,5.383,70.1,0.01124,0.04097,0.07469,0.03441,0.02768,0.00624,20.82,30.44,142,1313,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
+90317302,B,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,0.18,0.06569,0.1911,0.5477,1.348,11.88,0.005682,0.01365,0.008496,0.006929,0.01938,0.002371,11.38,15.65,73.23,394.5,0.1343,0.165,0.08615,0.06696,0.2937,0.07722
+903483,B,8.734,16.84,55.27,234.3,0.1039,0.07428,0,0,0.1985,0.07098,0.5169,2.079,3.167,28.85,0.01582,0.01966,0,0,0.01865,0.006736,10.17,22.8,64.01,317,0.146,0.131,0,0,0.2445,0.08865
+903507,M,15.49,19.97,102.4,744.7,0.116,0.1562,0.1891,0.09113,0.1929,0.06744,0.647,1.331,4.675,66.91,0.007269,0.02928,0.04972,0.01639,0.01852,0.004232,21.2,29.41,142.1,1359,0.1681,0.3913,0.5553,0.2121,0.3187,0.1019
+903516,M,21.61,22.28,144.4,1407,0.1167,0.2087,0.281,0.1562,0.2162,0.06606,0.6242,0.9209,4.158,80.99,0.005215,0.03726,0.04718,0.01288,0.02045,0.004028,26.23,28.74,172,2081,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007
+903554,B,12.1,17.72,78.07,446.2,0.1029,0.09758,0.04783,0.03326,0.1937,0.06161,0.2841,1.652,1.869,22.22,0.008146,0.01631,0.01843,0.007513,0.02015,0.001798,13.56,25.8,88.33,559.5,0.1432,0.1773,0.1603,0.06266,0.3049,0.07081
+903811,B,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.03251,0.1641,0.05764,0.1504,1.685,1.237,12.67,0.005371,0.01273,0.01132,0.009155,0.01719,0.001444,14.92,25.34,96.42,684.5,0.1066,0.1231,0.0846,0.07911,0.2523,0.06609
+90401601,B,13.51,18.89,88.1,558.1,0.1059,0.1147,0.0858,0.05381,0.1806,0.06079,0.2136,1.332,1.513,19.29,0.005442,0.01957,0.03304,0.01367,0.01315,0.002464,14.8,27.2,97.33,675.2,0.1428,0.257,0.3438,0.1453,0.2666,0.07686
+90401602,B,12.8,17.46,83.05,508.3,0.08044,0.08895,0.0739,0.04083,0.1574,0.0575,0.3639,1.265,2.668,30.57,0.005421,0.03477,0.04545,0.01384,0.01869,0.004067,13.74,21.06,90.72,591,0.09534,0.1812,0.1901,0.08296,0.1988,0.07053
+904302,B,11.06,14.83,70.31,378.2,0.07741,0.04768,0.02712,0.007246,0.1535,0.06214,0.1855,0.6881,1.263,12.98,0.004259,0.01469,0.0194,0.004168,0.01191,0.003537,12.68,20.35,80.79,496.7,0.112,0.1879,0.2079,0.05556,0.259,0.09158
+904357,B,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,0.1847,0.06019,0.3438,1.14,2.225,25.06,0.005463,0.01964,0.02079,0.005398,0.01477,0.003071,13.45,24.49,86,562,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121
+90439701,M,17.91,21.02,124.4,994,0.123,0.2576,0.3189,0.1198,0.2113,0.07115,0.403,0.7747,3.123,41.51,0.007159,0.03718,0.06165,0.01051,0.01591,0.005099,20.8,27.78,149.6,1304,0.1873,0.5917,0.9034,0.1964,0.3245,0.1198
+904647,B,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,0.05541,0.2522,1.045,1.649,18.95,0.006175,0.01204,0.01376,0.005832,0.01096,0.001857,13.8,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262
+904689,B,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,0.2357,1.299,2.397,20.21,0.003629,0.03713,0.03452,0.01065,0.02632,0.003705,14.13,24.61,96.31,621.9,0.09329,0.2318,0.1604,0.06608,0.3207,0.07247
+9047,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,0.905,0.9975,11.36,0.002887,0.01285,0.01613,0.007308,0.0187,0.001972,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
+904969,B,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708,0.3833,0.9078,2.602,30.15,0.007702,0.008491,0.01307,0.0103,0.0297,0.001432,13.18,16.85,84.11,533.1,0.1048,0.06744,0.04921,0.04793,0.2298,0.05974
+904971,B,10.94,18.59,70.39,370,0.1004,0.0746,0.04944,0.02932,0.1486,0.06615,0.3796,1.743,3.018,25.78,0.009519,0.02134,0.0199,0.01155,0.02079,0.002701,12.4,25.58,82.76,472.4,0.1363,0.1644,0.1412,0.07887,0.2251,0.07732
+905189,B,16.14,14.86,104.3,80

<TRUNCATED>
r***@apache.org
2018-06-28 14:54:45 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
new file mode 100644
index 0000000..8ea1660
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
+ * </p>
+ *
+ * <ul>
+ * <li>sumX2: sum of the square of all X's preference values</li>
+ * <li>sumY2: sum of the square of all Y's preference values</li>
+ * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
+ * preference</li>
+ * </ul>
+ *
+ * <p>
+ * The correlation is then:
+ *
+ * <p>
+ * {@code sumXY / sqrt(sumX2 * sumY2)}
+ * </p>
+ *
+ * <p>
+ * Note that this correlation "centers" its data, shifting the user's preference values so that each of their
+ * means is 0. This is necessary to achieve expected behavior on all data sets.
+ * </p>
+ *
+ * <p>
+ * This correlation implementation is equivalent to the cosine similarity since the data it receives
+ * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
+ * between the two vectors defined by the users' preference values.
+ * </p>
+ *
+ * <p>
+ * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
+ * </p>
+ */
+public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
+
+ /**
+ * Creates an unweighted Pearson similarity over the given model
+ * (delegates with {@link Weighting#UNWEIGHTED}).
+ *
+ * @param dataModel source of preference data
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
+ this(dataModel, Weighting.UNWEIGHTED);
+ }
+
+ /**
+ * Creates a Pearson similarity over the given model with the given weighting.
+ *
+ * @param dataModel source of preference data
+ * @param weighting {@link Weighting#WEIGHTED} or {@link Weighting#UNWEIGHTED}; passed through to the superclass
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+ // NOTE(review): the boolean flag presumably asks the superclass for centered data
+ // (consistent with the class javadoc's "centers its data") -- confirm against AbstractSimilarity.
+ super(dataModel, weighting, true);
+ Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+ }
+
+ /**
+ * Computes the correlation from aggregates accumulated by the superclass:
+ * {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}.
+ *
+ * @param n number of items over which the aggregates were computed
+ * @param sumXY sum of products of corresponding preference values
+ * @param sumX2 sum of squares of X's preference values
+ * @param sumY2 sum of squares of Y's preference values
+ * @param sumXYdiff2 unused by this measure
+ * @return the correlation, or {@link Double#NaN} when {@code n == 0} or either norm is zero
+ */
+ @Override
+ double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+ if (n == 0) {
+ return Double.NaN;
+ }
+ // Note that sum of X and sum of Y don't appear here since they are assumed to be 0;
+ // the data is assumed to be centered.
+ double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+ if (denominator == 0.0) {
+ // One or both parties has -all- the same ratings;
+ // can't really say much similarity under this measure
+ return Double.NaN;
+ }
+ return sumXY / denominator;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
new file mode 100644
index 0000000..1116368
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Like {@link PearsonCorrelationSimilarity}, but compares the relative ranking of preference values instead
+ * of the preference values themselves. That is, each user's preferences are sorted and each preference is
+ * then assigned its rank as its value, with 1 assigned to the least preferred item.
+ * </p>
+ */
+public final class SpearmanCorrelationSimilarity implements UserSimilarity {
+
+  private final DataModel dataModel;
+
+  public SpearmanCorrelationSimilarity(DataModel dataModel) {
+    this.dataModel = Preconditions.checkNotNull(dataModel);
+  }
+
+  /**
+   * Returns the Spearman rank correlation over the items common to both users,
+   * or {@link Double#NaN} when fewer than two items are in common.
+   */
+  @Override
+  public double userSimilarity(long userID1, long userID2) throws TasteException {
+    PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
+    PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
+    int xLength = xPrefs.length();
+    int yLength = yPrefs.length();
+
+    if (xLength <= 1 || yLength <= 1) {
+      return Double.NaN;
+    }
+
+    // Copy prefs since we need to overwrite pref values with ranks
+    xPrefs = xPrefs.clone();
+    yPrefs = yPrefs.clone();
+
+    // First sort by values from low to high
+    xPrefs.sortByValue();
+    yPrefs.sortByValue();
+
+    // Assign ranks from low to high ...
+    float nextRank = 1.0f;
+    for (int i = 0; i < xLength; i++) {
+      // ... but only for items that are common to both pref arrays
+      if (yPrefs.hasPrefWithItemID(xPrefs.getItemID(i))) {
+        xPrefs.setValue(i, nextRank);
+        nextRank += 1.0f;
+      }
+      // Other values are bogus but don't matter
+    }
+    nextRank = 1.0f;
+    for (int i = 0; i < yLength; i++) {
+      if (xPrefs.hasPrefWithItemID(yPrefs.getItemID(i))) {
+        yPrefs.setValue(i, nextRank);
+        nextRank += 1.0f;
+      }
+    }
+
+    xPrefs.sortByItem();
+    yPrefs.sortByItem();
+
+    long xIndex = xPrefs.getItemID(0);
+    long yIndex = yPrefs.getItemID(0);
+    int xPrefIndex = 0;
+    int yPrefIndex = 0;
+
+    double sumXYRankDiff2 = 0.0;
+    int count = 0;
+
+    // Merge-join the two item-sorted arrays, accumulating squared rank
+    // differences over the common items only.
+    while (true) {
+      int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
+      if (compare == 0) {
+        double diff = xPrefs.getValue(xPrefIndex) - yPrefs.getValue(yPrefIndex);
+        sumXYRankDiff2 += diff * diff;
+        count++;
+      }
+      if (compare <= 0) {
+        if (++xPrefIndex >= xLength) {
+          break;
+        }
+        xIndex = xPrefs.getItemID(xPrefIndex);
+      }
+      if (compare >= 0) {
+        if (++yPrefIndex >= yLength) {
+          break;
+        }
+        yIndex = yPrefs.getItemID(yPrefIndex);
+      }
+    }
+
+    if (count <= 1) {
+      return Double.NaN;
+    }
+
+    // When ranks are unique, this formula actually gives the Pearson correlation.
+    // Compute the denominator in double arithmetic: the int expression
+    // count * (count * count - 1) overflows once count exceeds 1290 common items.
+    double n = count;
+    return 1.0 - 6.0 * sumXYRankDiff2 / (n * (n * n - 1.0));
+  }
+
+  @Override
+  public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+    RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
new file mode 100644
index 0000000..0c3a0a4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * <p>
+ * An implementation of a "similarity" based on the <a
+ * href="http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_coefficient_.28extended_Jaccard_coefficient.29">
+ * Tanimoto coefficient</a>, or extended <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard
+ * coefficient</a>.
+ * </p>
+ *
+ * <p>
+ * This is intended for "binary" data sets where a user either expresses a generic "yes" preference for an
+ * item or has no preference. The actual preference values do not matter here, only their presence or absence.
+ * </p>
+ *
+ * <p>
+ * The value returned is in [0,1].
+ * </p>
+ */
+public final class TanimotoCoefficientSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public TanimotoCoefficientSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ FastIDSet xPrefs = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet yPrefs = dataModel.getItemIDsFromUser(userID2);
+
+ int xPrefsSize = xPrefs.size();
+ int yPrefsSize = yPrefs.size();
+ if (xPrefsSize == 0 && yPrefsSize == 0) {
+ return Double.NaN;
+ }
+ if (xPrefsSize == 0 || yPrefsSize == 0) {
+ return 0.0;
+ }
+
+ int intersectionSize =
+ xPrefsSize < yPrefsSize ? yPrefs.intersectionSize(xPrefs) : xPrefs.intersectionSize(yPrefs);
+ if (intersectionSize == 0) {
+ return Double.NaN;
+ }
+
+ int unionSize = xPrefsSize + yPrefsSize - intersectionSize;
+
+ return (double) intersectionSize / (double) unionSize;
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+ return doItemSimilarity(itemID1, itemID2, preferring1);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1);
+ }
+ return result;
+ }
+
+ private double doItemSimilarity(long itemID1, long itemID2, int preferring1) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ if (preferring1and2 == 0) {
+ return Double.NaN;
+ }
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ return (double) preferring1and2 / (double) (preferring1 + preferring2 - preferring1and2);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+ }
+
+ @Override
+ public String toString() {
+ return "TanimotoCoefficientSimilarity[dataModel:" + getDataModel() + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
new file mode 100644
index 0000000..6260606
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the cosine similarity. The result is the cosine of the angle formed between
+ * the two preference vectors.
+ * </p>
+ *
+ * <p>
+ * Note that this similarity does not "center" its data, shifts the user's preference values so that each of their
+ * means is 0. For this behavior, use {@link PearsonCorrelationSimilarity}, which actually is mathematically
+ * equivalent for centered data.
+ * </p>
+ */
+public final class UncenteredCosineSimilarity extends AbstractSimilarity {
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException {
+ this(dataModel, Weighting.UNWEIGHTED);
+ }
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+ super(dataModel, weighting, false);
+ Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+ }
+
+ @Override
+ double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+ if (n == 0) {
+ return Double.NaN;
+ }
+ double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+ if (denominator == 0.0) {
+ // One or both parties has -all- the same ratings;
+ // can't really say much similarity under this measure
+ return Double.NaN;
+ }
+ return sumXY / denominator;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
new file mode 100644
index 0000000..1ae45c2
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity},
+ * producing a fresh {@link FileItemItemSimilarityIterator} per call to {@link #iterator()}.
+ */
+final class FileItemItemSimilarityIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> {
+
+  // File containing one item-item similarity per line
+  private final File similaritiesFile;
+
+  FileItemItemSimilarityIterable(File similaritiesFile) {
+    this.similaritiesFile = similaritiesFile;
+  }
+
+  @Override
+  public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() {
+    try {
+      return new FileItemItemSimilarityIterator(similaritiesFile);
+    } catch (IOException ioe) {
+      // Iterable.iterator() cannot declare a checked exception, so wrap the I/O failure
+      throw new IllegalStateException("Can't read " + similaritiesFile, ioe);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
new file mode 100644
index 0000000..c071159
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.common.iterator.FileLineIterator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+/**
+ * a simple iterator using a {@link FileLineIterator} internally, parsing each
+ * line into an {@link GenericItemSimilarity.ItemItemSimilarity}.
+ *
+ * <p>Expected line format: {@code itemID1,itemID2,value}, with either commas or
+ * tabs as separators. NOTE(review): a malformed or short line will surface as an
+ * unchecked exception from parsing/indexing during iteration — confirm inputs are
+ * always well-formed.</p>
+ */
+final class FileItemItemSimilarityIterator extends ForwardingIterator<GenericItemSimilarity.ItemItemSimilarity> {
+
+  // Fields may be separated by comma or tab; compiled once per class
+  private static final Pattern SEPARATOR = Pattern.compile("[,\t]");
+
+  private final Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate;
+
+  FileItemItemSimilarityIterator(File similaritiesFile) throws IOException {
+    // Lazily transforms each file line into an ItemItemSimilarity as it is consumed
+    delegate = Iterators.transform(
+        new FileLineIterator(similaritiesFile),
+        new Function<String, GenericItemSimilarity.ItemItemSimilarity>() {
+          @Override
+          public GenericItemSimilarity.ItemItemSimilarity apply(String from) {
+            String[] tokens = SEPARATOR.split(from);
+            return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]),
+                Long.parseLong(tokens[1]),
+                Double.parseDouble(tokens[2]));
+          }
+        });
+  }
+
+  @Override
+  protected Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate() {
+    return delegate;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
new file mode 100644
index 0000000..712b96a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemSimilarity.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An {@link ItemSimilarity} backed by a comma-delimited file. This class typically expects a file where each line
+ * contains an item ID, followed by another item ID, followed by a similarity value, separated by commas. You may also
+ * use tabs.
+ * </p>
+ *
+ * <p>
+ * The similarity value is assumed to be parseable as a {@code double} having a value between -1 and 1. The
+ * item IDs are parsed as {@code long}s. Similarities are symmetric so for a pair of items you do not have to
+ * include 2 lines in the file.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ *
+ * <p>
+ * This class is not intended for use with very large amounts of data. For that, a JDBC-backed {@link ItemSimilarity}
+ * and a database are more appropriate.
+ * </p>
+ */
+public class FileItemSimilarity implements ItemSimilarity {
+
+  public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+
+  // NOTE(review): written under reloadLock in reload() but read without
+  // synchronization by the itemSimilarity* methods; if refresh() can run on a
+  // different thread than readers, consider making this volatile — confirm usage.
+  private ItemSimilarity delegate;
+  // Guards reload(); tryLock means concurrent refreshers skip rather than wait
+  private final ReentrantLock reloadLock;
+  private final File dataFile;
+  // Last-modified timestamp of dataFile at the time of the last successful reload
+  private long lastModified;
+  private final long minReloadIntervalMS;
+
+  private static final Logger log = LoggerFactory.getLogger(FileItemSimilarity.class);
+
+  /**
+   * @param dataFile
+   *          file containing the similarity data
+   */
+  public FileItemSimilarity(File dataFile) {
+    this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+  }
+
+  /**
+   * @param minReloadIntervalMS
+   *          the minimum interval in milliseconds after which a full reload of the original datafile is done
+   *          when refresh() is called
+   * @see #FileItemSimilarity(File)
+   */
+  public FileItemSimilarity(File dataFile, long minReloadIntervalMS) {
+    Preconditions.checkArgument(dataFile != null, "dataFile is null");
+    Preconditions.checkArgument(dataFile.exists() && !dataFile.isDirectory(),
+        "dataFile is missing or a directory: %s", dataFile);
+
+    log.info("Creating FileItemSimilarity for file {}", dataFile);
+
+    this.dataFile = dataFile.getAbsoluteFile();
+    this.lastModified = dataFile.lastModified();
+    this.minReloadIntervalMS = minReloadIntervalMS;
+    this.reloadLock = new ReentrantLock();
+
+    // Eagerly load the file so the delegate is available before the first query
+    reload();
+  }
+
+  @Override
+  public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+    return delegate.itemSimilarities(itemID1, itemID2s);
+  }
+
+  @Override
+  public long[] allSimilarItemIDs(long itemID) throws TasteException {
+    return delegate.allSimilarItemIDs(itemID);
+  }
+
+  @Override
+  public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+    return delegate.itemSimilarity(itemID1, itemID2);
+  }
+
+  /**
+   * Reloads the data file, but only if it was modified more than
+   * {@code minReloadIntervalMS} after the last load.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    if (dataFile.lastModified() > lastModified + minReloadIntervalMS) {
+      log.debug("File has changed; reloading...");
+      reload();
+    }
+  }
+
+  // Rebuilds the in-memory delegate from the file. tryLock (not lock) means a
+  // concurrent caller simply skips the reload instead of doing it twice.
+  protected void reload() {
+    if (reloadLock.tryLock()) {
+      try {
+        // Capture the timestamp before parsing so a concurrent file update
+        // after this point triggers another reload on the next refresh()
+        long newLastModified = dataFile.lastModified();
+        delegate = new GenericItemSimilarity(new FileItemItemSimilarityIterable(dataFile));
+        lastModified = newLastModified;
+      } finally {
+        reloadLock.unlock();
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "FileItemSimilarity[dataFile:" + dataFile + ']';
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
new file mode 100644
index 0000000..631ec9b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/FileSimilarItemsWriter.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.precompute;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import com.google.common.io.Closeables;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItems;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItemsWriter;
+
+/**
+ * Persist the precomputed item similarities to a file that can later be used
+ * by a {@link org.apache.mahout.cf.taste.impl.similarity.file.FileItemSimilarity}.
+ * Output is one CSV line per similar item: {@code itemID,similarItemID,similarity}.
+ */
+public class FileSimilarItemsWriter implements SimilarItemsWriter {
+
+  private final File file;
+  private BufferedWriter writer;
+
+  public FileSimilarItemsWriter(File file) {
+    this.file = file;
+  }
+
+  @Override
+  public void open() throws IOException {
+    // Always write UTF-8, regardless of the platform default charset
+    writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), Charsets.UTF_8));
+  }
+
+  @Override
+  public void add(SimilarItems similarItems) throws IOException {
+    String itemID = String.valueOf(similarItems.getItemID());
+    for (SimilarItem similar : similarItems.getSimilarItems()) {
+      StringBuilder line = new StringBuilder(itemID);
+      line.append(',').append(similar.getItemID()).append(',').append(similar.getSimilarity());
+      writer.write(line.toString());
+      writer.newLine();
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    // swallowIOException=false: failures on close are propagated to the caller
+    Closeables.close(writer, false);
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
new file mode 100644
index 0000000..b7b52cf
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/precompute/MultithreadedBatchItemSimilarities.java
@@ -0,0 +1,230 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.precompute;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItems;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItemsWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Precompute item similarities in parallel on a single machine. The recommender given to this class must use a
+ * DataModel that holds the interactions in memory (such as
+ * {@link org.apache.mahout.cf.taste.impl.model.GenericDataModel} or
+ * {@link org.apache.mahout.cf.taste.impl.model.file.FileDataModel}) as fast random access to the data is required
+ */
+public class MultithreadedBatchItemSimilarities extends BatchItemSimilarities {
+
+  // Number of item IDs handed to a worker at a time; set once in the
+  // constructor (NOTE(review): could be final)
+  private int batchSize;
+
+  private static final int DEFAULT_BATCH_SIZE = 100;
+
+  private static final Logger log = LoggerFactory.getLogger(MultithreadedBatchItemSimilarities.class);
+
+  /**
+   * @param recommender recommender to use
+   * @param similarItemsPerItem number of similar items to compute per item
+   */
+  public MultithreadedBatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem) {
+    this(recommender, similarItemsPerItem, DEFAULT_BATCH_SIZE);
+  }
+
+  /**
+   * @param recommender recommender to use
+   * @param similarItemsPerItem number of similar items to compute per item
+   * @param batchSize size of item batches sent to worker threads
+   */
+  public MultithreadedBatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem, int batchSize) {
+    super(recommender, similarItemsPerItem);
+    this.batchSize = batchSize;
+  }
+
+  /**
+   * Computes similarities for all items using {@code degreeOfParallelism} worker
+   * threads plus one output thread, writing results through {@code writer}.
+   *
+   * @return number of similarities written
+   * @throws IOException if setup fails or the underlying computation throws
+   */
+  @Override
+  public int computeItemSimilarities(int degreeOfParallelism, int maxDurationInHours, SimilarItemsWriter writer)
+    throws IOException {
+
+    // +1 thread for the Output drainer alongside the workers
+    ExecutorService executorService = Executors.newFixedThreadPool(degreeOfParallelism + 1);
+
+    Output output = null;
+    try {
+      writer.open();
+
+      DataModel dataModel = getRecommender().getDataModel();
+
+      BlockingQueue<long[]> itemsIDsInBatches = queueItemIDsInBatches(dataModel, batchSize, degreeOfParallelism);
+      BlockingQueue<List<SimilarItems>> results = new LinkedBlockingQueue<>();
+
+      // Workers decrement this as they finish; Output uses it to know when to stop
+      AtomicInteger numActiveWorkers = new AtomicInteger(degreeOfParallelism);
+      for (int n = 0; n < degreeOfParallelism; n++) {
+        executorService.execute(new SimilarItemsWorker(n, itemsIDsInBatches, results, numActiveWorkers));
+      }
+
+      output = new Output(results, writer, numActiveWorkers);
+      executorService.execute(output);
+
+    } catch (Exception e) {
+      throw new IOException(e);
+    } finally {
+      // Stop accepting tasks and wait for workers + output to drain
+      executorService.shutdown();
+      try {
+        boolean succeeded = executorService.awaitTermination(maxDurationInHours, TimeUnit.HOURS);
+        if (!succeeded) {
+          throw new RuntimeException("Unable to complete the computation in " + maxDurationInHours + " hours!");
+        }
+      } catch (InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+      Closeables.close(writer, false);
+    }
+
+    return output.getNumSimilaritiesProcessed();
+  }
+
+  // Splits all item IDs of the model into batches of batchSize and queues them
+  // for the workers; the last batch may be smaller.
+  private static BlockingQueue<long[]> queueItemIDsInBatches(DataModel dataModel, int batchSize,
+                                                             int degreeOfParallelism)
+    throws TasteException {
+
+    LongPrimitiveIterator itemIDs = dataModel.getItemIDs();
+    int numItems = dataModel.getNumItems();
+
+    BlockingQueue<long[]> itemIDBatches = new LinkedBlockingQueue<>((numItems / batchSize) + 1);
+
+    long[] batch = new long[batchSize];
+    int pos = 0;
+    while (itemIDs.hasNext()) {
+      batch[pos] = itemIDs.nextLong();
+      pos++;
+      if (pos == batchSize) {
+        // clone: the buffer is reused for the next batch
+        itemIDBatches.add(batch.clone());
+        pos = 0;
+      }
+    }
+
+    // Flush the final, possibly partial batch
+    if (pos > 0) {
+      long[] lastBatch = new long[pos];
+      System.arraycopy(batch, 0, lastBatch, 0, pos);
+      itemIDBatches.add(lastBatch);
+    }
+
+    // Each worker must have at least one batch to process
+    if (itemIDBatches.size() < degreeOfParallelism) {
+      throw new IllegalStateException("Degree of parallelism [" + degreeOfParallelism + "] " +
+          " is larger than number of batches [" + itemIDBatches.size() +"].");
+    }
+
+    log.info("Queued {} items in {} batches", numItems, itemIDBatches.size());
+
+    return itemIDBatches;
+  }
+
+
+  // Drains worker results and writes them until all workers are done and the
+  // queue is empty.
+  private static class Output implements Runnable {
+
+    private final BlockingQueue<List<SimilarItems>> results;
+    private final SimilarItemsWriter writer;
+    private final AtomicInteger numActiveWorkers;
+    // Only written by this runnable's thread; read after the executor terminates
+    private int numSimilaritiesProcessed = 0;
+
+    Output(BlockingQueue<List<SimilarItems>> results, SimilarItemsWriter writer, AtomicInteger numActiveWorkers) {
+      this.results = results;
+      this.writer = writer;
+      this.numActiveWorkers = numActiveWorkers;
+    }
+
+    private int getNumSimilaritiesProcessed() {
+      return numSimilaritiesProcessed;
+    }
+
+    @Override
+    public void run() {
+      // Keep draining while any worker is alive or results remain; poll with a
+      // short timeout so we re-check the worker count regularly
+      while (numActiveWorkers.get() != 0 || !results.isEmpty()) {
+        try {
+          List<SimilarItems> similarItemsOfABatch = results.poll(10, TimeUnit.MILLISECONDS);
+          if (similarItemsOfABatch != null) {
+            for (SimilarItems similarItems : similarItemsOfABatch) {
+              writer.add(similarItems);
+              numSimilaritiesProcessed += similarItems.numSimilarItems();
+            }
+          }
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+    }
+  }
+
+  // Pulls item-ID batches off the queue and computes the most similar items for
+  // each item in the batch.
+  private class SimilarItemsWorker implements Runnable {
+
+    private final int number;
+    private final BlockingQueue<long[]> itemIDBatches;
+    private final BlockingQueue<List<SimilarItems>> results;
+    private final AtomicInteger numActiveWorkers;
+
+    SimilarItemsWorker(int number, BlockingQueue<long[]> itemIDBatches, BlockingQueue<List<SimilarItems>> results,
+        AtomicInteger numActiveWorkers) {
+      this.number = number;
+      this.itemIDBatches = itemIDBatches;
+      this.results = results;
+      this.numActiveWorkers = numActiveWorkers;
+    }
+
+    @Override
+    public void run() {
+
+      int numBatchesProcessed = 0;
+      // NOTE(review): between isEmpty() and take() another worker may consume the
+      // last batch, leaving this thread blocked in take() forever; poll() with a
+      // timeout would avoid that — confirm whether this can occur in practice.
+      while (!itemIDBatches.isEmpty()) {
+        try {
+          long[] itemIDBatch = itemIDBatches.take();
+
+          List<SimilarItems> similarItemsOfBatch = new ArrayList<>(itemIDBatch.length);
+          for (long itemID : itemIDBatch) {
+            List<RecommendedItem> similarItems = getRecommender().mostSimilarItems(itemID, getSimilarItemsPerItem());
+            similarItemsOfBatch.add(new SimilarItems(itemID, similarItems));
+          }
+
+          // Unbounded queue: offer never fails here
+          results.offer(similarItemsOfBatch);
+
+          if (++numBatchesProcessed % 5 == 0) {
+            log.info("worker {} processed {} batches", number, numBatchesProcessed);
+          }
+
+        } catch (Exception e) {
+          throw new RuntimeException(e);
+        }
+      }
+      log.info("worker {} processed {} batches. done.", number, numBatchesProcessed);
+      numActiveWorkers.decrementAndGet();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
new file mode 100644
index 0000000..022d02d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/DataModel.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import java.io.Serializable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+/**
+ * <p>
+ * Implementations represent a repository of information about users and their associated {@link Preference}s
+ * for items.
+ * </p>
+ */
+public interface DataModel extends Refreshable, Serializable {
+
+ /**
+ * @return all user IDs in the model, in order
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ LongPrimitiveIterator getUserIDs() throws TasteException;
+
+ /**
+ * @param userID
+ * ID of user to get prefs for
+ * @return user's preferences, ordered by item ID
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if the user does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ PreferenceArray getPreferencesFromUser(long userID) throws TasteException;
+
+ /**
+ * @param userID
+ * ID of user to get prefs for
+ * @return IDs of items user expresses a preference for
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if the user does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ FastIDSet getItemIDsFromUser(long userID) throws TasteException;
+
+ /**
+ * @return a {@link LongPrimitiveIterator} of all item IDs in the model, in order
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ LongPrimitiveIterator getItemIDs() throws TasteException;
+
+ /**
+ * @param itemID
+ * item ID
+ * @return all existing {@link Preference}s expressed for that item, ordered by user ID, as an array
+ * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+ * if the item does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ PreferenceArray getPreferencesForItem(long itemID) throws TasteException;
+
+ /**
+ * Retrieves the preference value for a single user and item.
+ *
+ * @param userID
+ * user ID to get pref value from
+ * @param itemID
+ * item ID to get pref value for
+ * @return preference value from the given user for the given item or null if none exists
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if the user does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ Float getPreferenceValue(long userID, long itemID) throws TasteException;
+
+ /**
+ * Retrieves the time at which a preference value from a user and item was set, if known.
+ * Time is expressed in the usual way, as a number of milliseconds since the epoch.
+ *
+ * @param userID user ID for preference in question
+ * @param itemID item ID for preference in question
+ * @return time at which preference was set or null if no preference exists or its time is not known
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException if the user does not exist
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ Long getPreferenceTime(long userID, long itemID) throws TasteException;
+
+ /**
+ * @return total number of items known to the model. This is generally the union of all items preferred by
+ * at least one user but could include more.
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ int getNumItems() throws TasteException;
+
+ /**
+ * @return total number of users known to the model.
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ int getNumUsers() throws TasteException;
+
+ /**
+ * @param itemID item ID to check for
+ * @return the number of users who have expressed a preference for the item
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ int getNumUsersWithPreferenceFor(long itemID) throws TasteException;
+
+ /**
+ * @param itemID1 first item ID to check for
+ * @param itemID2 second item ID to check for
+ * @return the number of users who have expressed a preference for both of the items
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException;
+
+ /**
+ * <p>
+ * Sets a particular preference (item plus rating) for a user.
+ * </p>
+ *
+ * @param userID
+ * user to set preference for
+ * @param itemID
+ * item to set preference for
+ * @param value
+ * preference value
+ * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+ * if the item does not exist
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if the user does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ void setPreference(long userID, long itemID, float value) throws TasteException;
+
+ /**
+ * <p>
+ * Removes a particular preference for a user.
+ * </p>
+ *
+ * @param userID
+ * user from which to remove preference
+ * @param itemID
+ * item to remove preference for
+ * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+ * if the item does not exist
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if the user does not exist
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ void removePreference(long userID, long itemID) throws TasteException;
+
+ /**
+ * @return true if this implementation actually stores and returns distinct preference values;
+ * that is, if it is not a 'boolean' DataModel
+ */
+ boolean hasPreferenceValues();
+
+ /**
+ * @return the maximum preference value that is possible in the current problem domain being evaluated. For
+ * example, if the domain is movie ratings on a scale of 1 to 5, this should be 5. While a
+ * {@link org.apache.mahout.cf.taste.recommender.Recommender} may estimate a preference value above 5.0, it
+ * isn't "fair" to consider that the system is actually suggesting an impossible rating of, say, 5.4 stars.
+ * In practice the application would cap this estimate to 5.0. Since evaluators evaluate
+ * the difference between estimated and actual value, this at least prevents this effect from unfairly
+ * penalizing a {@link org.apache.mahout.cf.taste.recommender.Recommender}
+ */
+ float getMaxPreference();
+
+ /**
+ * @return the minimum preference value that is possible in the current problem domain being evaluated
+ * @see #getMaxPreference()
+ */
+ float getMinPreference();
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
new file mode 100644
index 0000000..cc477fe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/IDMigrator.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Mahout 0.2 changed the framework to operate only in terms of numeric (long) ID values for users and items.
+ * This is, obviously, not compatible with applications that used other key types -- most commonly
+ * {@link String}. Implementations of this interface provide support for mapping Strings to longs and vice
+ * versa in order to provide a smoother migration path to applications that must still use strings as IDs.
+ * </p>
+ *
+ * <p>
+ * The mapping from strings to 64-bit numeric values is fixed here, to provide a standard implementation that
+ * is 'portable' or reproducible outside the framework easily. See {@link #toLongID(String)}.
+ * </p>
+ *
+ * <p>
+ * Because this mapping is deterministically computable, it does not need to be stored. Indeed, subclasses'
+ * job is to store the reverse mapping. There are an infinite number of strings but only a fixed number of
+ * longs, so, it is possible for two strings to map to the same value. Subclasses do not treat this as an
+ * error but rather retain only the most recent mapping, overwriting a previous mapping. The probability of
+ * collision in a 64-bit space is quite small, but not zero. However, in the context of a collaborative
+ * filtering problem, the consequence of a collision is small, at worst -- perhaps one user receives another
+ * user's recommendations.
+ * </p>
+ *
+ * @since 0.2
+ */
+public interface IDMigrator extends Refreshable {
+
+ /**
+ * @param stringID
+ * string ID to map to a long ID
+ * @return the top 8 bytes of the MD5 hash of the bytes of the given {@link String}'s UTF-8 encoding as a
+ * long.
+ */
+ long toLongID(String stringID);
+
+ /**
+ * @param longID
+ * long ID to map back to a string ID
+ * @return the string ID most recently associated with the given long ID, or null if doesn't exist
+ * @throws TasteException
+ * if an error occurs while retrieving the mapping
+ */
+ String toStringID(long longID) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
new file mode 100644
index 0000000..e91ed48
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/JDBCDataModel.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import javax.sql.DataSource;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+
+/**
+ * A {@link DataModel} backed by a JDBC data source.
+ */
+public interface JDBCDataModel extends DataModel {
+
+ /**
+ * @return {@link DataSource} underlying this model
+ */
+ DataSource getDataSource();
+
+ /**
+ * Exports all user preference data, with preference values. This is most relevant for a database-backed
+ * implementation, which is not in memory but might want to export its contents to memory.
+ *
+ * @return all user preference data, keyed by user ID
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ FastByIDMap<PreferenceArray> exportWithPrefs() throws TasteException;
+
+ /**
+ * @return all user preference data as item ID sets keyed by user ID -- presumably without preference
+ * values, per the method name; TODO confirm against implementations
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ FastByIDMap<FastIDSet> exportWithIDsOnly() throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
new file mode 100644
index 0000000..fe0150a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/Preference.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+/**
+ * <p>
+ * A {@link Preference} encapsulates an item and a preference value, which indicates the strength of the
+ * preference for it. {@link Preference}s are associated to users. Implementations may be mutable via
+ * {@link #setValue(float)}.
+ * </p>
+ */
+public interface Preference {
+
+ /** @return ID of user who prefers the item */
+ long getUserID();
+
+ /** @return item ID that is preferred */
+ long getItemID();
+
+ /**
+ * @return strength of the preference for that item. Zero should indicate "no preference either way";
+ * positive values indicate preference and negative values indicate dislike
+ */
+ float getValue();
+
+ /**
+ * Sets the strength of the preference for this item
+ *
+ * @param value
+ * new preference value
+ */
+ void setValue(float value);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
new file mode 100644
index 0000000..3886bc6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/PreferenceArray.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import java.io.Serializable;
+
+/**
+ * An alternate representation of an array of {@link Preference}. Implementations, in theory, can produce a
+ * more memory-efficient representation.
+ */
+public interface PreferenceArray extends Cloneable, Serializable, Iterable<Preference> {
+
+ /**
+ * @return size (length) of the "array"
+ */
+ int length();
+
+ /**
+ * @param i
+ * index
+ * @return a materialized {@link Preference} representation of the preference at i
+ */
+ Preference get(int i);
+
+ /**
+ * Sets preference at i from information in the given {@link Preference}
+ *
+ * @param i
+ * index
+ * @param pref
+ * preference to copy information from
+ */
+ void set(int i, Preference pref);
+
+ /**
+ * @param i
+ * index
+ * @return user ID from preference at i
+ */
+ long getUserID(int i);
+
+ /**
+ * Sets user ID for preference at i.
+ *
+ * @param i
+ * index
+ * @param userID
+ * new user ID
+ */
+ void setUserID(int i, long userID);
+
+ /**
+ * @param i
+ * index
+ * @return item ID from preference at i
+ */
+ long getItemID(int i);
+
+ /**
+ * Sets item ID for preference at i.
+ *
+ * @param i
+ * index
+ * @param itemID
+ * new item ID
+ */
+ void setItemID(int i, long itemID);
+
+ /**
+ * @return all user or item IDs
+ */
+ long[] getIDs();
+
+ /**
+ * @param i
+ * index
+ * @return preference value from preference at i
+ */
+ float getValue(int i);
+
+ /**
+ * Sets preference value for preference at i.
+ *
+ * @param i
+ * index
+ * @param value
+ * new preference value
+ */
+ void setValue(int i, float value);
+
+ /**
+ * @return independent copy of this object
+ */
+ PreferenceArray clone();
+
+ /**
+ * Sorts underlying array by user ID, ascending.
+ */
+ void sortByUser();
+
+ /**
+ * Sorts underlying array by item ID, ascending.
+ */
+ void sortByItem();
+
+ /**
+ * Sorts underlying array by preference value, ascending.
+ */
+ void sortByValue();
+
+ /**
+ * Sorts underlying array by preference value, descending.
+ */
+ void sortByValueReversed();
+
+ /**
+ * @param userID
+ * user ID
+ * @return true if array contains a preference with given user ID
+ */
+ boolean hasPrefWithUserID(long userID);
+
+ /**
+ * @param itemID
+ * item ID
+ * @return true if array contains a preference with given item ID
+ */
+ boolean hasPrefWithItemID(long itemID);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
new file mode 100644
index 0000000..ff29a34
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/model/UpdatableIDMigrator.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.model;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+
+public interface UpdatableIDMigrator extends IDMigrator {
+
+ /**
+ * Stores the reverse long-to-String mapping in some kind of backing store. Note that this must be called
+ * directly (or indirectly through {@link #initialize(Iterable)}) for every String that might be encountered
+ * in the application, or else the mapping will not be known.
+ *
+ * @param longID
+ * long ID
+ * @param stringID
+ * string ID that maps to/from that long ID
+ * @throws TasteException
+ * if an error occurs while saving the mapping
+ */
+ void storeMapping(long longID, String stringID) throws TasteException;
+
+ /**
+ * Make the mapping aware of the given string IDs. This must be called initially before the implementation
+ * is used, or else it will not be aware of reverse long-to-String mappings.
+ *
+ * @throws TasteException
+ * if an error occurs while storing the mappings
+ */
+ void initialize(Iterable<String> stringIDs) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
new file mode 100644
index 0000000..2a143e1
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/neighborhood/UserNeighborhood.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.neighborhood;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface compute a "neighborhood" of users like a given user. This neighborhood
+ * can then be used to compute recommendations.
+ * </p>
+ */
+public interface UserNeighborhood extends Refreshable {
+
+ /**
+ * @param userID
+ * ID of user for which a neighborhood will be computed
+ * @return IDs of users in the neighborhood
+ * @throws TasteException
+ * if an error occurs while accessing data
+ */
+ long[] getUserNeighborhood(long userID) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
new file mode 100644
index 0000000..ada1949
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/CandidateItemsStrategy.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+/**
+ * Used to retrieve all items that could possibly be recommended to the user
+ */
+public interface CandidateItemsStrategy extends Refreshable {
+
+ /**
+ * @param userID
+ * ID of the user to find candidate items for
+ * @param preferencesFromUser
+ * the preferences already expressed by that user
+ * @param dataModel
+ * model containing the preference data
+ * @param includeKnownItems
+ * whether items the user already has a preference for may be included as candidates
+ * @return IDs of all items that could be recommended to the user
+ * @throws TasteException
+ * if an error occurs while accessing the data
+ */
+ FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel,
+ boolean includeKnownItems) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
new file mode 100644
index 0000000..d9a9cf7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/IDRescorer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+/**
+ * <p>
+ * A {@link Rescorer} which operates on {@code long} primitive IDs, rather than arbitrary {@link Object}s.
+ * This is provided since most uses of this interface in the framework take IDs (as {@code long}) as an
+ * argument, and so this can be used to avoid unnecessary boxing/unboxing.
+ * </p>
+ *
+ * @see Rescorer
+ */
+public interface IDRescorer {
+
+ /**
+ * @param id
+ * ID of thing (user, item, etc.) to rescore
+ * @param originalScore
+ * original score
+ * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely
+ */
+ double rescore(long id, double originalScore);
+
+ /**
+ * Returns {@code true} to exclude the given thing.
+ *
+ * @param id
+ * ID of thing (user, item, etc.) to rescore
+ * @return {@code true} to exclude, {@code false} otherwise
+ */
+ boolean isFiltered(long id);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
new file mode 100644
index 0000000..570f851
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/ItemBasedRecommender.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * Interface implemented by "item-based" recommenders.
+ * </p>
+ */
+public interface ItemBasedRecommender extends Recommender {
+
+ /**
+ * @param itemID
+ * ID of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @return items most similar to the given item, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long itemID, int howMany) throws TasteException;
+
+ /**
+ * @param itemID
+ * ID of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @param rescorer
+ * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+ * items
+ * @return items most similar to the given item, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long itemID, int howMany, Rescorer<LongPair> rescorer) throws TasteException;
+
+ /**
+ * @param itemIDs
+ * IDs of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @return items most similar to the given items, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany) throws TasteException;
+
+ /**
+ * @param itemIDs
+ * IDs of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @param rescorer
+ * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+ * items
+ * @return items most similar to the given items, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+ int howMany,
+ Rescorer<LongPair> rescorer) throws TasteException;
+
+ /**
+ * @param itemIDs
+ * IDs of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @param excludeItemIfNotSimilarToAll
+ * exclude an item if it is not similar to each of the input items
+ * @return items most similar to the given items, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+ int howMany,
+ boolean excludeItemIfNotSimilarToAll) throws TasteException;
+
+ /**
+ * @param itemIDs
+ * IDs of item for which to find most similar other items
+ * @param howMany
+ * desired number of most similar items to find
+ * @param rescorer
+ * {@link Rescorer} which can adjust item-item similarity estimates used to determine most similar
+ * items
+ * @param excludeItemIfNotSimilarToAll
+ * exclude an item if it is not similar to each of the input items
+ * @return items most similar to the given items, ordered from most similar to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> mostSimilarItems(long[] itemIDs,
+ int howMany,
+ Rescorer<LongPair> rescorer,
+ boolean excludeItemIfNotSimilarToAll) throws TasteException;
+
+ /**
+ * <p>
+ * Lists the items that were most influential in recommending a given item to a given user. Exactly how this
+ * is determined is left to the implementation, but, generally this will return items that the user prefers
+ * and that are similar to the given item.
+ * </p>
+ *
+ * <p>
+ * This returns a {@link List} of {@link RecommendedItem} which is a little misleading since it's returning
+ * recommend<strong>ing</strong> items, but, I thought it more natural to just reuse this class since it
+ * encapsulates an item and value. The value here does not necessarily have a consistent interpretation or
+ * expected range; it will be higher the more influential the item was in the recommendation.
+ * </p>
+ *
+ * @param userID
+ * ID of user who was recommended the item
+ * @param itemID
+ * ID of item that was recommended
+ * @param howMany
+ * maximum number of items to return
+ * @return {@link List} of {@link RecommendedItem}, ordered from most influential in recommending the given
+ * item to least
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
new file mode 100644
index 0000000..282ceff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/MostSimilarItemsCandidateItemsStrategy.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+
/**
 * Used to retrieve all items that could possibly be similar
 */
public interface MostSimilarItemsCandidateItemsStrategy extends Refreshable {

  /**
   * @param itemIDs
   *          IDs of the items for which similarity candidates are sought
   * @param dataModel
   *          {@link DataModel} to read preference data from
   * @return IDs of all items that could possibly be similar to the given items
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel) throws TasteException;
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
new file mode 100644
index 0000000..1fcece8
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/RecommendedItem.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
/**
 * <p>
 * Implementations encapsulate items that are recommended, and include the item recommended and a value
 * expressing the strength of the preference. Instances are typically produced by a {@link Recommender}.
 * </p>
 */
public interface RecommendedItem {

  /** @return the recommended item ID */
  long getItemID();

  /**
   * <p>
   * A value expressing the strength of the preference for the recommended item. The range of the values
   * depends on the implementation. Implementations must use larger values to express stronger preference.
   * </p>
   *
   * @return strength of the preference
   */
  float getValue();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
new file mode 100644
index 0000000..4135aff
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Recommender.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.DataModel;
+
/**
 * <p>
 * Implementations of this interface can recommend items for a user. Implementations will likely take
 * advantage of several classes in other packages here to compute this.
 * </p>
 */
public interface Recommender extends Refreshable {

  /**
   * @param userID
   *          user for which recommendations are to be computed
   * @param howMany
   *          desired number of recommendations
   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
   *         least
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  List<RecommendedItem> recommend(long userID, int howMany) throws TasteException;

  /**
   * @param userID
   *          user for which recommendations are to be computed
   * @param howMany
   *          desired number of recommendations
   * @param includeKnownItems
   *          whether to include items already known by the user in recommendations
   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
   *         least
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException;

  /**
   * @param userID
   *          user for which recommendations are to be computed
   * @param howMany
   *          desired number of recommendations
   * @param rescorer
   *          rescoring function to apply before final list of recommendations is determined
   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
   *         least
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException;

  /**
   * @param userID
   *          user for which recommendations are to be computed
   * @param howMany
   *          desired number of recommendations
   * @param rescorer
   *          rescoring function to apply before final list of recommendations is determined
   * @param includeKnownItems
   *          whether to include items already known by the user in recommendations
   * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommended to
   *         least
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
    throws TasteException;

  /**
   * @param userID
   *          user ID whose preference is to be estimated
   * @param itemID
   *          item ID to estimate preference for
   * @return an estimated preference if the user has not expressed a preference for the item, or else the
   *         user's actual preference for the item. If a preference cannot be estimated, returns
   *         {@link Double#NaN}
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  float estimatePreference(long userID, long itemID) throws TasteException;

  /**
   * @param userID
   *          user to set preference for
   * @param itemID
   *          item to set preference for
   * @param value
   *          preference value
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  void setPreference(long userID, long itemID, float value) throws TasteException;

  /**
   * @param userID
   *          user from which to remove preference
   * @param itemID
   *          item for which to remove preference
   * @throws TasteException
   *           if an error occurs while accessing the {@link DataModel}
   */
  void removePreference(long userID, long itemID) throws TasteException;

  /**
   * @return underlying {@link DataModel} used by this {@link Recommender} implementation
   */
  DataModel getDataModel();

}
r***@apache.org
2018-06-28 14:54:52 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
new file mode 100644
index 0000000..f74511b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ToItemVectorsReducer.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.preparation;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+
/**
 * Reducer that merges all partial item vectors emitted under the same item (row) index into a single
 * item vector per key.
 */
public class ToItemVectorsReducer extends Reducer<IntWritable,VectorWritable,IntWritable,VectorWritable> {

  // Single Writable reused across reduce() calls, following Hadoop's object-reuse convention to avoid
  // per-call allocation. Safe because ctx.write() serializes the contents before the next call.
  private final VectorWritable merged = new VectorWritable();

  @Override
  protected void reduce(IntWritable row, Iterable<VectorWritable> vectors, Context ctx)
    throws IOException, InterruptedException {

    // Lax precision presumably writes values in a reduced-precision form to shrink output size
    // — see VectorWritable for the exact semantics (NOTE(review): confirm).
    merged.setWritesLaxPrecision(true);
    merged.set(VectorWritable.mergeToVector(vectors.iterator()));
    ctx.write(row, merged);
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
new file mode 100644
index 0000000..c50fa20
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
+
+/**
+ * <p>Distributed precomputation of the item-item-similarities for Itembased Collaborative Filtering</p>
+ *
+ * <p>Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}</p>
+ *
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
+ * simply expresses a preference for an item, but no degree of preference).
+ * </p>
+ *
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * parsed as {@code long}s.
+ * </p>
+ *
+ * <p>Command line arguments specific to this class are:</p>
+ *
+ * <ol>
+ * <li>--input (path): Directory containing one or more text files with the preference data</li>
+ * <li>--output (path): output path where similarity data should be written</li>
+ * <li>--similarityClassname (classname): Name of distributed similarity measure class to instantiate or a predefined
+ * similarity from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
+ * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
+ * <li>--maxPrefs (integer): max number of preferences to consider per user or item; users or items with more
+ * preferences will be sampled down (500)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with fewer preferences than this (1)</li>
+ * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below this</li>
+ * </ol>
+ *
+ * <p>General command line options are documented in {@link AbstractJob}.</p>
+ *
+ * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.</p>
+ */
+public final class ItemSimilarityJob extends AbstractJob {
+
+ public static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
+ public static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";
+
+ private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
+ private static final int DEFAULT_MAX_PREFS = 500;
+ private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new ItemSimilarityJob(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOutputOption();
+ addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
+ + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
+ addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number "
+ + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
+ String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
+ addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, "
+ + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
+ String.valueOf(DEFAULT_MAX_PREFS));
+ addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
+ + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
+ addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
+ addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
+ addOption("randomSeed", null, "use this seed for sampling", false);
+
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ String similarityClassName = getOption("similarityClassname");
+ int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
+ int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
+ int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+ boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+
+ double threshold = hasOption("threshold")
+ ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
+ long randomSeed = hasOption("randomSeed")
+ ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED;
+
+ Path similarityMatrixPath = getTempPath("similarityMatrix");
+ Path prepPath = getTempPath("prepareRatingMatrix");
+
+ AtomicInteger currentPhase = new AtomicInteger();
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] {
+ "--input", getInputPath().toString(),
+ "--output", prepPath.toString(),
+ "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
+ "--booleanData", String.valueOf(booleanData),
+ "--tempDir", getTempPath().toString(),
+ });
+ }
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
+
+ ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
+ "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
+ "--output", similarityMatrixPath.toString(),
+ "--numberOfColumns", String.valueOf(numberOfUsers),
+ "--similarityClassname", similarityClassName,
+ "--maxObservationsPerRow", String.valueOf(maxPrefs),
+ "--maxObservationsPerColumn", String.valueOf(maxPrefs),
+ "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
+ "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+ "--threshold", String.valueOf(threshold),
+ "--randomSeed", String.valueOf(randomSeed),
+ "--tempDir", getTempPath().toString(),
+ });
+ }
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
+ MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
+ MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
+ Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
+ mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
+ new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
+ mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
+ boolean succeeded = mostSimilarItems.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ }
+
+ return 0;
+ }
+
+ public static class MostSimilarItemPairsMapper
+ extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {
+
+ private OpenIntLongHashMap indexItemIDMap;
+ private int maxSimilarItemsPerItem;
+
+ @Override
+ protected void setup(Context ctx) {
+ Configuration conf = ctx.getConfiguration();
+ maxSimilarItemsPerItem = conf.getInt(MAX_SIMILARITIES_PER_ITEM, -1);
+ indexItemIDMap = TasteHadoopUtils.readIDIndexMap(conf.get(ITEM_ID_INDEX_PATH_STR), conf);
+
+ Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem must be greater then 0!");
+ }
+
+ @Override
+ protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
+ throws IOException, InterruptedException {
+
+ int itemIDIndex = itemIDIndexWritable.get();
+
+ TopSimilarItemsQueue topKMostSimilarItems = new TopSimilarItemsQueue(maxSimilarItemsPerItem);
+
+ for (Vector.Element element : similarityVector.get().nonZeroes()) {
+ SimilarItem top = topKMostSimilarItems.top();
+ double candidateSimilarity = element.get();
+ if (candidateSimilarity > top.getSimilarity()) {
+ top.set(indexItemIDMap.get(element.index()), candidateSimilarity);
+ topKMostSimilarItems.updateTop();
+ }
+ }
+
+ long itemID = indexItemIDMap.get(itemIDIndex);
+ for (SimilarItem similarItem : topKMostSimilarItems.getTopItems()) {
+ long otherItemID = similarItem.getItemID();
+ if (itemID < otherItemID) {
+ ctx.write(new EntityEntityWritable(itemID, otherItemID), new DoubleWritable(similarItem.getSimilarity()));
+ } else {
+ ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity()));
+ }
+ }
+ }
+ }
+
+ public static class MostSimilarItemPairsReducer
+ extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
+ @Override
+ protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
+ throws IOException, InterruptedException {
+ ctx.write(pair, values.iterator().next());
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
new file mode 100644
index 0000000..acb6392
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/TopSimilarItemsQueue.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.mahout.cf.taste.similarity.precompute.SimilarItem;
+
+public class TopSimilarItemsQueue extends PriorityQueue<SimilarItem> {
+
+ private static final long SENTINEL_ID = Long.MIN_VALUE;
+
+ private final int maxSize;
+
+ public TopSimilarItemsQueue(int maxSize) {
+ super(maxSize);
+ this.maxSize = maxSize;
+ }
+
+ public List<SimilarItem> getTopItems() {
+ List<SimilarItem> items = new ArrayList<>(maxSize);
+ while (size() > 0) {
+ SimilarItem topItem = pop();
+ // filter out "sentinel" objects necessary for maintaining an efficient priority queue
+ if (topItem.getItemID() != SENTINEL_ID) {
+ items.add(topItem);
+ }
+ }
+ Collections.reverse(items);
+ return items;
+ }
+
+ @Override
+ protected boolean lessThan(SimilarItem one, SimilarItem two) {
+ return one.getSimilarity() < two.getSimilarity();
+ }
+
+ @Override
+ protected SimilarItem getSentinelObject() {
+ return new SimilarItem(SENTINEL_ID, Double.MIN_VALUE);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
new file mode 100644
index 0000000..f46785c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/AbstractLongPrimitiveIterator.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
/**
 * Partial implementation of {@link LongPrimitiveIterator} that adapts the primitive
 * {@code nextLong()} to the boxed {@link #next()} required by {@link java.util.Iterator}.
 */
public abstract class AbstractLongPrimitiveIterator implements LongPrimitiveIterator {

  /** @return the next {@code long} value, boxed; delegates to {@code nextLong()} */
  @Override
  public Long next() {
    return nextLong();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
new file mode 100644
index 0000000..c46b4b6
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/BitSet.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
/** A simplified and streamlined version of {@link java.util.BitSet}. */
final class BitSet implements Serializable, Cloneable {

  // Bit i is stored at bit (i & 0x3F) of bits[i >>> 6].
  private final long[] bits;

  /**
   * Creates a bit set able to hold at least {@code numBits} bits, all initially clear.
   *
   * @param numBits number of bits to accommodate
   */
  BitSet(int numBits) {
    // ceil(numBits / 64) longs are needed to hold numBits bits
    int numLongs = numBits >>> 6;
    if ((numBits & 0x3F) != 0) {
      numLongs++;
    }
    bits = new long[numLongs];
  }

  private BitSet(long[] bits) {
    this.bits = bits;
  }

  /** @return {@code true} iff the bit at {@code index} is set (no range check, for speed) */
  boolean get(int index) {
    return (bits[index >>> 6] & 1L << (index & 0x3F)) != 0L;
  }

  /** Sets the bit at {@code index} (no range check, for speed). */
  void set(int index) {
    bits[index >>> 6] |= 1L << (index & 0x3F);
  }

  /** Clears the bit at {@code index} (no range check, for speed). */
  void clear(int index) {
    bits[index >>> 6] &= ~(1L << (index & 0x3F));
  }

  /** Clears all bits. */
  void clear() {
    // Arrays.fill expresses the intent directly instead of a hand-rolled loop
    Arrays.fill(bits, 0L);
  }

  /** @return an independent copy; subsequent mutations of either set do not affect the other */
  @Override
  public BitSet clone() {
    return new BitSet(bits.clone());
  }

  @Override
  public int hashCode() {
    return Arrays.hashCode(bits);
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof BitSet)) {
      return false;
    }
    BitSet other = (BitSet) o;
    return Arrays.equals(bits, other.bits);
  }

  /** @return bits rendered least-significant first, one space-terminated group of 64 per word */
  @Override
  public String toString() {
    // 65 = 64 bit characters plus the trailing space per word
    StringBuilder result = new StringBuilder(65 * bits.length);
    for (long l : bits) {
      for (int j = 0; j < 64; j++) {
        result.append((l & 1L << j) == 0 ? '0' : '1');
      }
      result.append(' ');
    }
    return result.toString();
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
new file mode 100755
index 0000000..b2d9b36
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/Cache.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+import java.util.Iterator;
+
+/**
+ * <p>
+ * An efficient Map-like class which caches values for keys. Values are not "put" into a {@link Cache};
+ * instead the caller supplies the instance with an implementation of {@link Retriever} which can load the
+ * value for a given key.
+ * </p>
+ *
+ * <p>
+ * The cache does not support {@code null} keys.
+ * </p>
+ *
+ * <p>
+ * Thanks to Amila Jayasooriya for helping evaluate performance of the rewrite of this class, as part of a
+ * Google Summer of Code 2007 project.
+ * </p>
+ */
+public final class Cache<K,V> implements Retriever<K,V> {
+
+ // Sentinel stored in place of null so that "retriever returned null" is itself cached
+ // and not re-fetched on every get().
+ private static final Object NULL = new Object();
+
+ private final FastMap<K,V> cache;
+ private final Retriever<? super K,? extends V> retriever;
+
+ /**
+ * <p>
+ * Creates a new cache based on the given {@link Retriever}.
+ * </p>
+ *
+ * @param retriever
+ * object which can retrieve values for keys
+ */
+ public Cache(Retriever<? super K,? extends V> retriever) {
+ this(retriever, FastMap.NO_MAX_SIZE);
+ }
+
+ /**
+ * <p>
+ * Creates a new cache based on the given {@link Retriever} and with given maximum size.
+ * </p>
+ *
+ * @param retriever
+ * object which can retrieve values for keys
+ * @param maxEntries
+ * maximum number of entries the cache will store before evicting some
+ */
+ public Cache(Retriever<? super K,? extends V> retriever, int maxEntries) {
+ Preconditions.checkArgument(retriever != null, "retriever is null");
+ Preconditions.checkArgument(maxEntries >= 1, "maxEntries must be at least 1");
+ cache = new FastMap<>(11, maxEntries);
+ this.retriever = retriever;
+ }
+
+ /**
+ * <p>
+ * Returns cached value for a key. If it does not exist, it is loaded using a {@link Retriever}.
+ * </p>
+ *
+ * @param key
+ * cache key
+ * @return value for that key
+ * @throws TasteException
+ * if an exception occurs while retrieving a new cached value
+ */
+ @Override
+ public V get(K key) throws TasteException {
+ V value;
+ synchronized (cache) {
+ value = cache.get(key);
+ }
+ if (value == null) {
+ // NOTE(review): lock is released before getAndCacheValue(), so concurrent callers
+ // missing on the same key may each invoke the retriever once — presumably acceptable
+ // best-effort behavior; confirm the retriever is safe to call redundantly.
+ return getAndCacheValue(key);
+ }
+ // Translate the NULL sentinel back into a real null for the caller.
+ return value == NULL ? null : value;
+ }
+
+ /**
+ * <p>
+ * Uncaches any existing value for a given key.
+ * </p>
+ *
+ * @param key
+ * cache key
+ */
+ public void remove(K key) {
+ synchronized (cache) {
+ cache.remove(key);
+ }
+ }
+
+ /**
+ * Clears all cache entries whose key matches the given predicate.
+ */
+ public void removeKeysMatching(MatchPredicate<K> predicate) {
+ synchronized (cache) {
+ Iterator<K> it = cache.keySet().iterator();
+ while (it.hasNext()) {
+ K key = it.next();
+ if (predicate.matches(key)) {
+ it.remove();
+ }
+ }
+ }
+ }
+
+ /**
+ * Clears all cache entries whose value matches the given predicate.
+ */
+ public void removeValueMatching(MatchPredicate<V> predicate) {
+ synchronized (cache) {
+ Iterator<V> it = cache.values().iterator();
+ while (it.hasNext()) {
+ V value = it.next();
+ if (predicate.matches(value)) {
+ it.remove();
+ }
+ }
+ }
+ }
+
+ /**
+ * <p>
+ * Clears the cache.
+ * </p>
+ */
+ public void clear() {
+ synchronized (cache) {
+ cache.clear();
+ }
+ }
+
+ // Loads the value from the retriever (outside any lock) and caches it,
+ // substituting the NULL sentinel when the retriever returns null.
+ private V getAndCacheValue(K key) throws TasteException {
+ V value = retriever.get(key);
+ if (value == null) {
+ // Unchecked cast is safe only because NULL is never handed back to callers:
+ // get() maps it to null before returning.
+ value = (V) NULL;
+ }
+ synchronized (cache) {
+ cache.put(key, value);
+ }
+ return value;
+ }
+
+ @Override
+ public String toString() {
+ return "Cache[retriever:" + retriever + ']';
+ }
+
+ /**
+ * Used by {@link #removeKeysMatching(MatchPredicate)} and
+ * {@link #removeValueMatching(MatchPredicate)} to decide which entries match.
+ */
+ public interface MatchPredicate<T> {
+ boolean matches(T thing);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
new file mode 100644
index 0000000..fde8958
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastByIDMap.java
@@ -0,0 +1,661 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.AbstractCollection;
+import java.util.AbstractSet;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * An open-addressed hash map from primitive {@code long} keys to object values,
+ * using double hashing (probe step derived from the key) over a twin-prime-sized
+ * table. Two {@code long} values are reserved as slot sentinels and therefore
+ * cannot be used as keys: {@link Long#MIN_VALUE} (empty) and
+ * {@link Long#MAX_VALUE} (removed). Optionally enforces a maximum size by
+ * evicting not-recently-accessed entries. Not thread-safe.
+ *
+ * @see FastMap
+ * @see FastIDSet
+ */
+public final class FastByIDMap<V> implements Serializable, Cloneable {
+
+ public static final int NO_MAX_SIZE = Integer.MAX_VALUE;
+ private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+
+ /** Dummy object used to represent a key that has been removed. */
+ private static final long REMOVED = Long.MAX_VALUE;
+ private static final long NULL = Long.MIN_VALUE;
+
+ private long[] keys;
+ private V[] values;
+ private float loadFactor;
+ // numEntries counts live mappings; numSlotsUsed also counts REMOVED tombstones.
+ private int numEntries;
+ private int numSlotsUsed;
+ private final int maxSize;
+ // One bit per slot, set on access; used as a clock-style "second chance" for eviction.
+ private BitSet recentlyAccessed;
+ private final boolean countingAccesses;
+
+ /** Creates a new {@link FastByIDMap} with default capacity. */
+ public FastByIDMap() {
+ this(2, NO_MAX_SIZE);
+ }
+
+ public FastByIDMap(int size) {
+ this(size, NO_MAX_SIZE);
+ }
+
+ public FastByIDMap(int size, float loadFactor) {
+ this(size, NO_MAX_SIZE, loadFactor);
+ }
+
+ public FastByIDMap(int size, int maxSize) {
+ this(size, maxSize, DEFAULT_LOAD_FACTOR);
+ }
+
+ /**
+ * Creates a new {@link FastByIDMap} whose capacity can accommodate the given number of entries without rehash.
+ *
+ * @param size desired capacity
+ * @param maxSize max capacity
+ * @param loadFactor ratio of internal hash table size to current size
+ * @throws IllegalArgumentException if size is less than 0, maxSize is less than 1
+ * or at least half of {@link RandomUtils#MAX_INT_SMALLER_TWIN_PRIME}, or
+ * loadFactor is less than 1
+ */
+ public FastByIDMap(int size, int maxSize, float loadFactor) {
+ Preconditions.checkArgument(size >= 0, "size must be at least 0");
+ Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
+ this.loadFactor = loadFactor;
+ int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
+ Preconditions.checkArgument(size < max, "size must be less than " + max);
+ Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1");
+ // Twin-prime table size guarantees the probe step (hashSize - 2 based) is coprime.
+ int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
+ keys = new long[hashSize];
+ Arrays.fill(keys, NULL);
+ values = (V[]) new Object[hashSize];
+ this.maxSize = maxSize;
+ // Access tracking is only needed when a size cap can force eviction.
+ this.countingAccesses = maxSize != Integer.MAX_VALUE;
+ this.recentlyAccessed = countingAccesses ? new BitSet(hashSize) : null;
+ }
+
+ /**
+ * Double-hashing probe that locates the slot holding {@code key}, or the first
+ * empty (NULL) slot if absent. Skips over REMOVED tombstones.
+ *
+ * @see #findForAdd(long)
+ */
+ private int find(long key) {
+ int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+ long[] keys = this.keys;
+ int hashSize = keys.length;
+ // Probe step in [1, hashSize - 2]; coprime to the prime table size.
+ int jump = 1 + theHashCode % (hashSize - 2);
+ int index = theHashCode % hashSize;
+ long currentKey = keys[index];
+ while (currentKey != NULL && key != currentKey) {
+ // Step backwards by jump, wrapping around the table.
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ return index;
+ }
+
+ /**
+ * Like {@link #find(long)} but for insertion: returns the slot of an existing
+ * equal key if present anywhere on the probe path, else the first reusable
+ * (NULL or REMOVED) slot encountered.
+ *
+ * @see #find(long)
+ */
+ private int findForAdd(long key) {
+ int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+ long[] keys = this.keys;
+ int hashSize = keys.length;
+ int jump = 1 + theHashCode % (hashSize - 2);
+ int index = theHashCode % hashSize;
+ long currentKey = keys[index];
+ while (currentKey != NULL && currentKey != REMOVED && key != currentKey) {
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ if (currentKey != REMOVED) {
+ return index;
+ }
+ // If we're adding, it's here, but, the key might have a value already later
+ int addIndex = index;
+ while (currentKey != NULL && key != currentKey) {
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ return key == currentKey ? index : addIndex;
+ }
+
+ /** Returns the value mapped to {@code key}, or {@code null} if absent. */
+ public V get(long key) {
+ if (key == NULL) {
+ return null;
+ }
+ int index = find(key);
+ if (countingAccesses) {
+ // Mark the slot as recently used so eviction spares it one pass.
+ recentlyAccessed.set(index);
+ }
+ return values[index];
+ }
+
+ public int size() {
+ return numEntries;
+ }
+
+ public boolean isEmpty() {
+ return numEntries == 0;
+ }
+
+ public boolean containsKey(long key) {
+ return key != NULL && key != REMOVED && keys[find(key)] != NULL;
+ }
+
+ /** Linear scan over values; O(table size). */
+ public boolean containsValue(Object value) {
+ if (value == null) {
+ return false;
+ }
+ for (V theValue : values) {
+ if (theValue != null && value.equals(theValue)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Associates {@code value} with {@code key}, returning the previous value or
+ * {@code null}. Rejects the sentinel keys and null values. May trigger a
+ * rehash/grow, or evict a stale entry when the size cap is reached.
+ */
+ public V put(long key, V value) {
+ Preconditions.checkArgument(key != NULL && key != REMOVED);
+ Preconditions.checkNotNull(value);
+ // If less than half the slots are open, let's clear it up
+ if (numSlotsUsed * loadFactor >= keys.length) {
+ // If over half the slots used are actual entries, let's grow
+ if (numEntries * loadFactor >= numSlotsUsed) {
+ growAndRehash();
+ } else {
+ // Otherwise just rehash to clear REMOVED entries and don't grow
+ rehash();
+ }
+ }
+ // Here we may later consider implementing Brent's variation described on page 532
+ int index = findForAdd(key);
+ long keyIndex = keys[index];
+ if (keyIndex == key) {
+ V oldValue = values[index];
+ values[index] = value;
+ return oldValue;
+ }
+ // If size is limited,
+ if (countingAccesses && numEntries >= maxSize) {
+ // and we're too large, clear some old-ish entry
+ clearStaleEntry(index);
+ }
+ keys[index] = key;
+ values[index] = value;
+ numEntries++;
+ if (keyIndex == NULL) {
+ numSlotsUsed++;
+ }
+ return null;
+ }
+
+ // Clock-style eviction: walk backwards from index over live entries, giving each
+ // recently-accessed entry a second chance (clearing its bit), and delete the first
+ // entry found whose access bit is not set.
+ private void clearStaleEntry(int index) {
+ while (true) {
+ long currentKey;
+ do {
+ if (index == 0) {
+ index = keys.length - 1;
+ } else {
+ index--;
+ }
+ currentKey = keys[index];
+ } while (currentKey == NULL || currentKey == REMOVED);
+ if (recentlyAccessed.get(index)) {
+ recentlyAccessed.clear(index);
+ } else {
+ break;
+ }
+ }
+ // Delete the entry
+ keys[index] = REMOVED;
+ numEntries--;
+ values[index] = null;
+ }
+
+ /** Removes the mapping for {@code key}, returning the old value or {@code null}. */
+ public V remove(long key) {
+ if (key == NULL || key == REMOVED) {
+ return null;
+ }
+ int index = find(key);
+ if (keys[index] == NULL) {
+ return null;
+ } else {
+ // Leave a REMOVED tombstone so other keys' probe chains stay intact.
+ keys[index] = REMOVED;
+ numEntries--;
+ V oldValue = values[index];
+ values[index] = null;
+ // don't decrement numSlotsUsed
+ return oldValue;
+ }
+ // Could un-set recentlyAccessed's bit but doesn't matter
+ }
+
+ public void clear() {
+ numEntries = 0;
+ numSlotsUsed = 0;
+ Arrays.fill(keys, NULL);
+ Arrays.fill(values, null);
+ if (countingAccesses) {
+ recentlyAccessed.clear();
+ }
+ }
+
+ public LongPrimitiveIterator keySetIterator() {
+ return new KeyIterator();
+ }
+
+ public Set<Map.Entry<Long,V>> entrySet() {
+ return new EntrySet();
+ }
+
+ public Collection<V> values() {
+ return new ValueCollection();
+ }
+
+ /** Rehashes to a table sized for the current entry count, dropping tombstones. */
+ public void rehash() {
+ rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
+ }
+
+ private void growAndRehash() {
+ if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
+ throw new IllegalStateException("Can't grow any more");
+ }
+ rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
+ }
+
+ // Re-inserts every live entry into a fresh table of the given size.
+ private void rehash(int newHashSize) {
+ long[] oldKeys = keys;
+ V[] oldValues = values;
+ numEntries = 0;
+ numSlotsUsed = 0;
+ if (countingAccesses) {
+ // Access history is deliberately discarded on rehash.
+ recentlyAccessed = new BitSet(newHashSize);
+ }
+ keys = new long[newHashSize];
+ Arrays.fill(keys, NULL);
+ values = (V[]) new Object[newHashSize];
+ int length = oldKeys.length;
+ for (int i = 0; i < length; i++) {
+ long key = oldKeys[i];
+ if (key != NULL && key != REMOVED) {
+ put(key, oldValues[i]);
+ }
+ }
+ }
+
+ // Shared removal logic for the iterators below; lastNext is the slot last returned.
+ void iteratorRemove(int lastNext) {
+ if (lastNext >= values.length) {
+ throw new NoSuchElementException();
+ }
+ if (lastNext < 0) {
+ throw new IllegalStateException();
+ }
+ values[lastNext] = null;
+ keys[lastNext] = REMOVED;
+ numEntries--;
+ }
+
+ @Override
+ public FastByIDMap<V> clone() {
+ FastByIDMap<V> clone;
+ try {
+ clone = (FastByIDMap<V>) super.clone();
+ } catch (CloneNotSupportedException cnse) {
+ throw new AssertionError();
+ }
+ clone.keys = keys.clone();
+ clone.values = values.clone();
+ // Clone starts with a blank access history rather than copying the original's.
+ clone.recentlyAccessed = countingAccesses ? new BitSet(keys.length) : null;
+ return clone;
+ }
+
+ @Override
+ public String toString() {
+ if (isEmpty()) {
+ return "{}";
+ }
+ StringBuilder result = new StringBuilder();
+ result.append('{');
+ for (int i = 0; i < keys.length; i++) {
+ long key = keys[i];
+ if (key != NULL && key != REMOVED) {
+ result.append(key).append('=').append(values[i]).append(',');
+ }
+ }
+ // Overwrite the trailing comma with the closing brace.
+ result.setCharAt(result.length() - 1, '}');
+ return result.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 0;
+ long[] keys = this.keys;
+ int max = keys.length;
+ for (int i = 0; i < max; i++) {
+ long key = keys[i];
+ if (key != NULL && key != REMOVED) {
+ hash = 31 * hash + ((int) (key >> 32) ^ (int) key);
+ hash = 31 * hash + values[i].hashCode();
+ }
+ }
+ return hash;
+ }
+
+ // NOTE(review): equality is compared slot-by-slot, so two maps with identical
+ // mappings but different table sizes or layouts compare unequal — confirm this
+ // layout-sensitive equality is intended.
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof FastByIDMap)) {
+ return false;
+ }
+ FastByIDMap<V> otherMap = (FastByIDMap<V>) other;
+ long[] otherKeys = otherMap.keys;
+ V[] otherValues = otherMap.values;
+ int length = keys.length;
+ int otherLength = otherKeys.length;
+ int max = Math.min(length, otherLength);
+
+ int i = 0;
+ while (i < max) {
+ long key = keys[i];
+ long otherKey = otherKeys[i];
+ if (key == NULL || key == REMOVED) {
+ if (otherKey != NULL && otherKey != REMOVED) {
+ return false;
+ }
+ } else {
+ if (key != otherKey || !values[i].equals(otherValues[i])) {
+ return false;
+ }
+ }
+ i++;
+ }
+ // Any live entry in the longer table's tail makes the maps unequal.
+ while (i < length) {
+ long key = keys[i];
+ if (key != NULL && key != REMOVED) {
+ return false;
+ }
+ i++;
+ }
+ while (i < otherLength) {
+ long key = otherKeys[i];
+ if (key != NULL && key != REMOVED) {
+ return false;
+ }
+ i++;
+ }
+ return true;
+ }
+
+ // Iterates live keys by scanning slots; skips empty slots via the values array
+ // (values[i] is non-null exactly for live entries).
+ private final class KeyIterator extends AbstractLongPrimitiveIterator {
+
+ private int position;
+ private int lastNext = -1;
+
+ @Override
+ public boolean hasNext() {
+ goToNext();
+ return position < keys.length;
+ }
+
+ @Override
+ public long nextLong() {
+ goToNext();
+ lastNext = position;
+ if (position >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ return keys[position++];
+ }
+
+ @Override
+ public long peek() {
+ goToNext();
+ if (position >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ return keys[position];
+ }
+
+ private void goToNext() {
+ int length = values.length;
+ while (position < length && values[position] == null) {
+ position++;
+ }
+ }
+
+ @Override
+ public void remove() {
+ iteratorRemove(lastNext);
+ }
+
+ @Override
+ public void skip(int n) {
+ position += n;
+ }
+
+ }
+
+ // Read-mostly live view of the map's entries; mutating operations other than
+ // clear() and iterator removal are unsupported.
+ private final class EntrySet extends AbstractSet<Map.Entry<Long,V>> {
+
+ @Override
+ public int size() {
+ return FastByIDMap.this.size();
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return FastByIDMap.this.isEmpty();
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ return containsKey((Long) o);
+ }
+
+ @Override
+ public Iterator<Map.Entry<Long,V>> iterator() {
+ return new EntryIterator();
+ }
+
+ @Override
+ public boolean add(Map.Entry<Long,V> t) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean remove(Object o) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean addAll(Collection<? extends Map.Entry<Long,V>> ts) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean retainAll(Collection<?> objects) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean removeAll(Collection<?> objects) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void clear() {
+ FastByIDMap.this.clear();
+ }
+
+ // Lightweight entry backed directly by a table slot; setValue writes through.
+ private final class MapEntry implements Map.Entry<Long,V> {
+
+ private final int index;
+
+ private MapEntry(int index) {
+ this.index = index;
+ }
+
+ @Override
+ public Long getKey() {
+ return keys[index];
+ }
+
+ @Override
+ public V getValue() {
+ return values[index];
+ }
+
+ @Override
+ public V setValue(V value) {
+ Preconditions.checkArgument(value != null);
+
+ V oldValue = values[index];
+ values[index] = value;
+ return oldValue;
+ }
+ }
+
+ private final class EntryIterator implements Iterator<Map.Entry<Long,V>> {
+
+ private int position;
+ private int lastNext = -1;
+
+ @Override
+ public boolean hasNext() {
+ goToNext();
+ return position < keys.length;
+ }
+
+ @Override
+ public Map.Entry<Long,V> next() {
+ goToNext();
+ lastNext = position;
+ if (position >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ return new MapEntry(position++);
+ }
+
+ private void goToNext() {
+ int length = values.length;
+ while (position < length && values[position] == null) {
+ position++;
+ }
+ }
+
+ @Override
+ public void remove() {
+ iteratorRemove(lastNext);
+ }
+ }
+
+ }
+
+ // Live view of the map's values; mutating operations other than clear() and
+ // iterator removal are unsupported.
+ private final class ValueCollection extends AbstractCollection<V> {
+
+ @Override
+ public int size() {
+ return FastByIDMap.this.size();
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return FastByIDMap.this.isEmpty();
+ }
+
+ @Override
+ public boolean contains(Object o) {
+ return containsValue(o);
+ }
+
+ @Override
+ public Iterator<V> iterator() {
+ return new ValueIterator();
+ }
+
+ @Override
+ public boolean add(V v) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean remove(Object o) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean addAll(Collection<? extends V> vs) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean removeAll(Collection<?> objects) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public boolean retainAll(Collection<?> objects) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void clear() {
+ FastByIDMap.this.clear();
+ }
+
+ private final class ValueIterator implements Iterator<V> {
+
+ private int position;
+ private int lastNext = -1;
+
+ @Override
+ public boolean hasNext() {
+ goToNext();
+ return position < values.length;
+ }
+
+ @Override
+ public V next() {
+ goToNext();
+ lastNext = position;
+ if (position >= values.length) {
+ throw new NoSuchElementException();
+ }
+ return values[position++];
+ }
+
+ private void goToNext() {
+ int length = values.length;
+ while (position < length && values[position] == null) {
+ position++;
+ }
+ }
+
+ @Override
+ public void remove() {
+ iteratorRemove(lastNext);
+ }
+
+ }
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
new file mode 100644
index 0000000..5908270
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastIDSet.java
@@ -0,0 +1,426 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * An open-addressed hash set of primitive {@code long} values, using double
+ * hashing over a twin-prime-sized table — the set analogue of
+ * {@link FastByIDMap}. Two {@code long} values are reserved as slot sentinels
+ * and cannot be stored: {@link Long#MIN_VALUE} (empty) and
+ * {@link Long#MAX_VALUE} (removed). Not thread-safe.
+ *
+ * @see FastByIDMap
+ */
+public final class FastIDSet implements Serializable, Cloneable, Iterable<Long> {
+
+ private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+
+ /** Dummy object used to represent a key that has been removed. */
+ private static final long REMOVED = Long.MAX_VALUE;
+ private static final long NULL = Long.MIN_VALUE;
+
+ private long[] keys;
+ private float loadFactor;
+ // numEntries counts live members; numSlotsUsed also counts REMOVED tombstones.
+ private int numEntries;
+ private int numSlotsUsed;
+
+ /** Creates a new {@link FastIDSet} with default capacity. */
+ public FastIDSet() {
+ this(2);
+ }
+
+ /** Creates a set pre-populated with the given keys. */
+ public FastIDSet(long[] initialKeys) {
+ this(initialKeys.length);
+ addAll(initialKeys);
+ }
+
+ public FastIDSet(int size) {
+ this(size, DEFAULT_LOAD_FACTOR);
+ }
+
+ /**
+ * Creates a set able to hold {@code size} entries without rehashing.
+ *
+ * @throws IllegalArgumentException if size is negative or too large,
+ * or loadFactor is less than 1
+ */
+ public FastIDSet(int size, float loadFactor) {
+ Preconditions.checkArgument(size >= 0, "size must be at least 0");
+ Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
+ this.loadFactor = loadFactor;
+ int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
+ Preconditions.checkArgument(size < max, "size must be less than %d", max);
+ // Twin-prime table size keeps the double-hash probe step coprime to it.
+ int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
+ keys = new long[hashSize];
+ Arrays.fill(keys, NULL);
+ }
+
+ /**
+ * Double-hashing probe that locates the slot holding {@code key}, or the first
+ * empty (NULL) slot if absent. Skips over REMOVED tombstones.
+ *
+ * @see #findForAdd(long)
+ */
+ private int find(long key) {
+ int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+ long[] keys = this.keys;
+ int hashSize = keys.length;
+ // Probe step in [1, hashSize - 2]; coprime to the prime table size.
+ int jump = 1 + theHashCode % (hashSize - 2);
+ int index = theHashCode % hashSize;
+ long currentKey = keys[index];
+ while (currentKey != NULL && key != currentKey) { // note: true when currentKey == REMOVED
+ // Step backwards by jump, wrapping around the table.
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ return index;
+ }
+
+ /**
+ * Like {@link #find(long)} but for insertion: returns the slot of an existing
+ * equal key if present anywhere on the probe path, else the first reusable
+ * (NULL or REMOVED) slot encountered.
+ *
+ * @see #find(long)
+ */
+ private int findForAdd(long key) {
+ int theHashCode = (int) key & 0x7FFFFFFF; // make sure it's positive
+ long[] keys = this.keys;
+ int hashSize = keys.length;
+ int jump = 1 + theHashCode % (hashSize - 2);
+ int index = theHashCode % hashSize;
+ long currentKey = keys[index];
+ while (currentKey != NULL && currentKey != REMOVED && key != currentKey) {
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ if (currentKey != REMOVED) {
+ return index;
+ }
+ // If we're adding, it's here, but, the key might have a value already later
+ int addIndex = index;
+ while (currentKey != NULL && key != currentKey) {
+ index -= index < jump ? jump - hashSize : jump;
+ currentKey = keys[index];
+ }
+ return key == currentKey ? index : addIndex;
+ }
+
+ public int size() {
+ return numEntries;
+ }
+
+ public boolean isEmpty() {
+ return numEntries == 0;
+ }
+
+ public boolean contains(long key) {
+ return key != NULL && key != REMOVED && keys[find(key)] != NULL;
+ }
+
+ /**
+ * Adds {@code key}, returning {@code true} if it was not already present.
+ * Rejects the two sentinel values. May trigger a rehash or grow.
+ */
+ public boolean add(long key) {
+ Preconditions.checkArgument(key != NULL && key != REMOVED);
+
+ // If less than half the slots are open, let's clear it up
+ if (numSlotsUsed * loadFactor >= keys.length) {
+ // If over half the slots used are actual entries, let's grow
+ if (numEntries * loadFactor >= numSlotsUsed) {
+ growAndRehash();
+ } else {
+ // Otherwise just rehash to clear REMOVED entries and don't grow
+ rehash();
+ }
+ }
+ // Here we may later consider implementing Brent's variation described on page 532
+ int index = findForAdd(key);
+ long keyIndex = keys[index];
+ if (keyIndex != key) {
+ keys[index] = key;
+ numEntries++;
+ if (keyIndex == NULL) {
+ numSlotsUsed++;
+ }
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public LongPrimitiveIterator iterator() {
+ return new KeyIterator();
+ }
+
+ /** Returns the members as a new array, in table-slot order. */
+ public long[] toArray() {
+ long[] result = new long[numEntries];
+ for (int i = 0, position = 0; i < result.length; i++) {
+ while (keys[position] == NULL || keys[position] == REMOVED) {
+ position++;
+ }
+ result[i] = keys[position++];
+ }
+ return result;
+ }
+
+ /** Removes {@code key}, returning {@code true} if it was present. */
+ public boolean remove(long key) {
+ if (key == NULL || key == REMOVED) {
+ return false;
+ }
+ int index = find(key);
+ if (keys[index] == NULL) {
+ return false;
+ } else {
+ // Leave a REMOVED tombstone so other keys' probe chains stay intact;
+ // numSlotsUsed is deliberately not decremented.
+ keys[index] = REMOVED;
+ numEntries--;
+ return true;
+ }
+ }
+
+ public boolean addAll(long[] c) {
+ boolean changed = false;
+ for (long k : c) {
+ if (add(k)) {
+ changed = true;
+ }
+ }
+ return changed;
+ }
+
+ // Iterates the other set's raw table, skipping sentinel slots.
+ public boolean addAll(FastIDSet c) {
+ boolean changed = false;
+ for (long k : c.keys) {
+ if (k != NULL && k != REMOVED && add(k)) {
+ changed = true;
+ }
+ }
+ return changed;
+ }
+
+ public boolean removeAll(long[] c) {
+ boolean changed = false;
+ for (long o : c) {
+ if (remove(o)) {
+ changed = true;
+ }
+ }
+ return changed;
+ }
+
+ public boolean removeAll(FastIDSet c) {
+ boolean changed = false;
+ for (long k : c.keys) {
+ if (k != NULL && k != REMOVED && remove(k)) {
+ changed = true;
+ }
+ }
+ return changed;
+ }
+
+ /** Removes every member not also contained in {@code c}. */
+ public boolean retainAll(FastIDSet c) {
+ boolean changed = false;
+ for (int i = 0; i < keys.length; i++) {
+ long k = keys[i];
+ if (k != NULL && k != REMOVED && !c.contains(k)) {
+ keys[i] = REMOVED;
+ numEntries--;
+ changed = true;
+ }
+ }
+ return changed;
+ }
+
+ public void clear() {
+ numEntries = 0;
+ numSlotsUsed = 0;
+ Arrays.fill(keys, NULL);
+ }
+
+ private void growAndRehash() {
+ if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
+ throw new IllegalStateException("Can't grow any more");
+ }
+ rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
+ }
+
+ /** Rehashes to a table sized for the current entry count, dropping tombstones. */
+ public void rehash() {
+ rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
+ }
+
+ // Re-inserts every live key into a fresh table of the given size.
+ private void rehash(int newHashSize) {
+ long[] oldKeys = keys;
+ numEntries = 0;
+ numSlotsUsed = 0;
+ keys = new long[newHashSize];
+ Arrays.fill(keys, NULL);
+ for (long key : oldKeys) {
+ if (key != NULL && key != REMOVED) {
+ add(key);
+ }
+ }
+ }
+
+ /**
+ * Convenience method to quickly compute just the size of the intersection with another {@link FastIDSet}.
+ *
+ * @param other
+ * {@link FastIDSet} to intersect with
+ * @return number of elements in intersection
+ */
+ public int intersectionSize(FastIDSet other) {
+ int count = 0;
+ for (long key : other.keys) {
+ if (key != NULL && key != REMOVED && keys[find(key)] != NULL) {
+ count++;
+ }
+ }
+ return count;
+ }
+
+ @Override
+ public FastIDSet clone() {
+ FastIDSet clone;
+ try {
+ clone = (FastIDSet) super.clone();
+ } catch (CloneNotSupportedException cnse) {
+ throw new AssertionError();
+ }
+ clone.keys = keys.clone();
+ return clone;
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 0;
+ long[] keys = this.keys;
+ for (long key : keys) {
+ if (key != NULL && key != REMOVED) {
+ hash = 31 * hash + ((int) (key >> 32) ^ (int) key);
+ }
+ }
+ return hash;
+ }
+
+ // NOTE(review): equality is compared slot-by-slot, so two sets with identical
+ // members but different table sizes or layouts compare unequal — confirm this
+ // layout-sensitive equality is intended.
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof FastIDSet)) {
+ return false;
+ }
+ FastIDSet otherMap = (FastIDSet) other;
+ long[] otherKeys = otherMap.keys;
+ int length = keys.length;
+ int otherLength = otherKeys.length;
+ int max = Math.min(length, otherLength);
+
+ int i = 0;
+ while (i < max) {
+ long key = keys[i];
+ long otherKey = otherKeys[i];
+ if (key == NULL || key == REMOVED) {
+ if (otherKey != NULL && otherKey != REMOVED) {
+ return false;
+ }
+ } else {
+ if (key != otherKey) {
+ return false;
+ }
+ }
+ i++;
+ }
+ // Any live entry in the longer table's tail makes the sets unequal.
+ while (i < length) {
+ long key = keys[i];
+ if (key != NULL && key != REMOVED) {
+ return false;
+ }
+ i++;
+ }
+ while (i < otherLength) {
+ long key = otherKeys[i];
+ if (key != NULL && key != REMOVED) {
+ return false;
+ }
+ i++;
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ if (isEmpty()) {
+ return "[]";
+ }
+ StringBuilder result = new StringBuilder();
+ result.append('[');
+ for (long key : keys) {
+ if (key != NULL && key != REMOVED) {
+ result.append(key).append(',');
+ }
+ }
+ // Overwrite the trailing comma with the closing bracket.
+ result.setCharAt(result.length() - 1, ']');
+ return result.toString();
+ }
+
+ // Iterates live keys by scanning table slots and skipping sentinels.
+ private final class KeyIterator extends AbstractLongPrimitiveIterator {
+
+ private int position;
+ private int lastNext = -1;
+
+ @Override
+ public boolean hasNext() {
+ goToNext();
+ return position < keys.length;
+ }
+
+ @Override
+ public long nextLong() {
+ goToNext();
+ lastNext = position;
+ if (position >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ return keys[position++];
+ }
+
+ @Override
+ public long peek() {
+ goToNext();
+ if (position >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ return keys[position];
+ }
+
+ private void goToNext() {
+ int length = keys.length;
+ while (position < length
+ && (keys[position] == NULL || keys[position] == REMOVED)) {
+ position++;
+ }
+ }
+
+ @Override
+ public void remove() {
+ if (lastNext >= keys.length) {
+ throw new NoSuchElementException();
+ }
+ if (lastNext < 0) {
+ throw new IllegalStateException();
+ }
+ keys[lastNext] = REMOVED;
+ numEntries--;
+ }
+
+ // NOTE(review): returns a fresh iterator over the enclosing set, unrelated to
+ // this iterator's position — looks vestigial; confirm whether it can be dropped.
+ public Iterator<Long> iterator() {
+ return new KeyIterator();
+ }
+
+ @Override
+ public void skip(int n) {
+ position += n;
+ }
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
new file mode 100644
index 0000000..7c64b44
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FastMap.java
@@ -0,0 +1,729 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+import java.util.AbstractCollection;
+import java.util.AbstractSet;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * This is an optimized {@link Map} implementation, based on algorithms described in Knuth's "Art of Computer
+ * Programming", Vol. 3, p. 529.
+ * </p>
+ *
+ * <p>
+ * It should be faster than {@link java.util.HashMap} in some cases, but not all. Its main feature is a
+ * "max size" and the ability to transparently, efficiently and semi-intelligently evict old entries when max
+ * size is exceeded.
+ * </p>
+ *
+ * <p>
+ * This class is not at all thread-safe.
+ * </p>
+ *
+ * <p>
+ * This implementation does not allow {@code null} as a key or value.
+ * </p>
+ */
+public final class FastMap<K,V> implements Map<K,V>, Serializable, Cloneable {
+
+  public static final int NO_MAX_SIZE = Integer.MAX_VALUE;
+  private static final float DEFAULT_LOAD_FACTOR = 1.5f;
+
+  /** Dummy object used to represent a key that has been removed. */
+  private static final Object REMOVED = new Object();
+
+  // Open-addressed hash table: keys and values are parallel arrays whose
+  // length is always a "twin prime" (see RandomUtils.nextTwinPrime), as the
+  // double-hashing probe in find()/findForAdd() requires.
+  private K[] keys;
+  private V[] values;
+  private float loadFactor;
+  private int numEntries;   // live entries
+  private int numSlotsUsed; // live entries plus REMOVED tombstones
+  private final int maxSize;
+  // NOTE(review): java.util.BitSet is not imported, so this appears to be the
+  // package-local org.apache.mahout.cf.taste.impl.common.BitSet — confirm.
+  private BitSet recentlyAccessed;
+  private final boolean countingAccesses;
+
+  /** Creates a new {@link FastMap} with default capacity. */
+  public FastMap() {
+    this(2, NO_MAX_SIZE);
+  }
+
+  public FastMap(int size) {
+    this(size, NO_MAX_SIZE);
+  }
+
+  public FastMap(Map<K,V> other) {
+    this(other.size());
+    putAll(other);
+  }
+
+  public FastMap(int size, float loadFactor) {
+    this(size, NO_MAX_SIZE, loadFactor);
+  }
+
+  public FastMap(int size, int maxSize) {
+    this(size, maxSize, DEFAULT_LOAD_FACTOR);
+  }
+
+  /**
+   * Creates a new {@link FastMap} whose capacity can accommodate the given number of entries
+   * without rehash.
+   *
+   * @param size desired capacity
+   * @param maxSize max capacity; when smaller than {@link #NO_MAX_SIZE}, access counting is
+   *   enabled and an old-ish entry is evicted whenever a put would exceed this size
+   * @param loadFactor ratio of table slots to entries
+   * @throws IllegalArgumentException if size is less than 0, maxSize is less than 1
+   *   or at least half of {@link RandomUtils#MAX_INT_SMALLER_TWIN_PRIME}, or
+   *   loadFactor is less than 1
+   */
+  public FastMap(int size, int maxSize, float loadFactor) {
+    Preconditions.checkArgument(size >= 0, "size must be at least 0");
+    Preconditions.checkArgument(loadFactor >= 1.0f, "loadFactor must be at least 1.0");
+    this.loadFactor = loadFactor;
+    int max = (int) (RandomUtils.MAX_INT_SMALLER_TWIN_PRIME / loadFactor);
+    Preconditions.checkArgument(size < max, "size must be less than " + max);
+    Preconditions.checkArgument(maxSize >= 1, "maxSize must be at least 1");
+    int hashSize = RandomUtils.nextTwinPrime((int) (loadFactor * size));
+    keys = (K[]) new Object[hashSize];
+    values = (V[]) new Object[hashSize];
+    this.maxSize = maxSize;
+    this.countingAccesses = maxSize != Integer.MAX_VALUE;
+    this.recentlyAccessed = countingAccesses ? new BitSet(hashSize) : null;
+  }
+
+  /**
+   * Double-hashing lookup (per the Knuth reference in the class javadoc): the
+   * probe step 'jump' is derived from the hash and, with a prime table length,
+   * the probe sequence visits every slot. Returns the slot holding a key equal
+   * to {@code key}, or the first null slot reached if it is absent.
+   */
+  private int find(Object key) {
+    int theHashCode = key.hashCode() & 0x7FFFFFFF; // make sure it's positive
+    K[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    K currentKey = keys[index];
+    while (currentKey != null && !key.equals(currentKey)) {
+      // step backwards by 'jump', wrapping past 0 to the end of the table
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return index;
+  }
+
+  /**
+   * Like {@link #find(Object)} but for insertion: returns the slot already
+   * holding this key, or else the first reusable (null or REMOVED) slot on the
+   * probe path.
+   * NOTE(review): keys are compared by reference here ({@code key != currentKey}),
+   * while {@link #find(Object)} uses {@code equals()} — confirm callers always
+   * pass canonical key instances, otherwise an equal-but-distinct key could end
+   * up occupying a second slot.
+   */
+  private int findForAdd(Object key) {
+    int theHashCode = key.hashCode() & 0x7FFFFFFF; // make sure it's positive
+    K[] keys = this.keys;
+    int hashSize = keys.length;
+    int jump = 1 + theHashCode % (hashSize - 2);
+    int index = theHashCode % hashSize;
+    K currentKey = keys[index];
+    while (currentKey != null && currentKey != REMOVED && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    if (currentKey != REMOVED) {
+      return index;
+    }
+    // Found a tombstone to reuse, but the key might still exist further along
+    // the probe sequence; if it does, that slot wins over the tombstone.
+    int addIndex = index;
+    while (currentKey != null && key != currentKey) {
+      index -= index < jump ? jump - hashSize : jump;
+      currentKey = keys[index];
+    }
+    return key == currentKey ? index : addIndex;
+  }
+
+  @Override
+  public V get(Object key) {
+    if (key == null) {
+      return null;
+    }
+    int index = find(key);
+    if (countingAccesses) {
+      // mark the slot recently used so eviction gives it a second chance
+      recentlyAccessed.set(index);
+    }
+    return values[index];
+  }
+
+  @Override
+  public int size() {
+    return numEntries;
+  }
+
+  @Override
+  public boolean isEmpty() {
+    return numEntries == 0;
+  }
+
+  @Override
+  public boolean containsKey(Object key) {
+    return key != null && keys[find(key)] != null;
+  }
+
+  @Override
+  public boolean containsValue(Object value) {
+    if (value == null) {
+      return false;
+    }
+    // linear scan over the entire table
+    for (V theValue : values) {
+      if (theValue != null && value.equals(theValue)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * @throws NullPointerException
+   *           if key or value is null
+   */
+  @Override
+  public V put(K key, V value) {
+    Preconditions.checkNotNull(key);
+    Preconditions.checkNotNull(value);
+    // If less than half the slots are open, let's clear it up
+    if (numSlotsUsed * loadFactor >= keys.length) {
+      // If over half the slots used are actual entries, let's grow
+      if (numEntries * loadFactor >= numSlotsUsed) {
+        growAndRehash();
+      } else {
+        // Otherwise just rehash to clear REMOVED entries and don't grow
+        rehash();
+      }
+    }
+    // Here we may later consider implementing Brent's variation described on page 532
+    int index = findForAdd(key);
+    // NOTE(review): identity comparison, consistent with findForAdd — see note there.
+    if (keys[index] == key) {
+      V oldValue = values[index];
+      values[index] = value;
+      return oldValue;
+    }
+    // If size is limited,
+    if (countingAccesses && numEntries >= maxSize) {
+      // and we're too large, clear some old-ish entry
+      clearStaleEntry(index);
+    }
+    keys[index] = key;
+    values[index] = value;
+    numEntries++;
+    numSlotsUsed++;
+    return null;
+  }
+
+  /**
+   * Evicts one entry to make room when the map is at max size: scans backwards
+   * from {@code index}; each live entry whose recently-accessed bit is set gets
+   * a second chance (the bit is cleared), and the first live entry found
+   * without the bit is tombstoned — a clock-style eviction heuristic.
+   */
+  private void clearStaleEntry(int index) {
+    while (true) {
+      K currentKey;
+      do {
+        if (index == 0) {
+          index = keys.length - 1;
+        } else {
+          index--;
+        }
+        currentKey = keys[index];
+      } while (currentKey == null || currentKey == REMOVED);
+      if (recentlyAccessed.get(index)) {
+        recentlyAccessed.clear(index);
+      } else {
+        break;
+      }
+    }
+    // Delete the entry
+    ((Object[])keys)[index] = REMOVED;
+    numEntries--;
+    values[index] = null;
+  }
+
+  @Override
+  public void putAll(Map<? extends K,? extends V> map) {
+    for (Entry<? extends K,? extends V> entry : map.entrySet()) {
+      put(entry.getKey(), entry.getValue());
+    }
+  }
+
+  @Override
+  public V remove(Object key) {
+    if (key == null) {
+      return null;
+    }
+    int index = find(key);
+    if (keys[index] == null) {
+      return null;
+    } else {
+      // leave a REMOVED tombstone so probe chains through this slot stay intact
+      ((Object[])keys)[index] = REMOVED;
+      numEntries--;
+      V oldValue = values[index];
+      values[index] = null;
+      // don't decrement numSlotsUsed
+      return oldValue;
+    }
+    // Could un-set recentlyAccessed's bit but doesn't matter
+  }
+
+  @Override
+  public void clear() {
+    numEntries = 0;
+    numSlotsUsed = 0;
+    Arrays.fill(keys, null);
+    Arrays.fill(values, null);
+    if (countingAccesses) {
+      recentlyAccessed.clear();
+    }
+  }
+
+  // The three views below are live over this map; their mutating methods throw
+  // UnsupportedOperationException except clear() and the iterators' remove().
+
+  @Override
+  public Set<K> keySet() {
+    return new KeySet();
+  }
+
+  @Override
+  public Collection<V> values() {
+    return new ValueCollection();
+  }
+
+  @Override
+  public Set<Entry<K,V>> entrySet() {
+    return new EntrySet();
+  }
+
+  /** Rehashes into a table sized for the current entry count, dropping tombstones. */
+  public void rehash() {
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * numEntries)));
+  }
+
+  private void growAndRehash() {
+    if (keys.length * loadFactor >= RandomUtils.MAX_INT_SMALLER_TWIN_PRIME) {
+      throw new IllegalStateException("Can't grow any more");
+    }
+    rehash(RandomUtils.nextTwinPrime((int) (loadFactor * keys.length)));
+  }
+
+  private void rehash(int newHashSize) {
+    K[] oldKeys = keys;
+    V[] oldValues = values;
+    numEntries = 0;
+    numSlotsUsed = 0;
+    if (countingAccesses) {
+      // access history is discarded on rehash
+      recentlyAccessed = new BitSet(newHashSize);
+    }
+    keys = (K[]) new Object[newHashSize];
+    values = (V[]) new Object[newHashSize];
+    int length = oldKeys.length;
+    for (int i = 0; i < length; i++) {
+      K key = oldKeys[i];
+      if (key != null && key != REMOVED) {
+        put(key, oldValues[i]);
+      }
+    }
+  }
+
+  // Shared removal logic for the view iterators: tombstones the given slot.
+  void iteratorRemove(int lastNext) {
+    if (lastNext >= values.length) {
+      throw new NoSuchElementException();
+    }
+    if (lastNext < 0) {
+      throw new IllegalStateException();
+    }
+    values[lastNext] = null;
+    ((Object[])keys)[lastNext] = REMOVED;
+    numEntries--;
+  }
+
+  @Override
+  public FastMap<K,V> clone() {
+    FastMap<K,V> clone;
+    try {
+      clone = (FastMap<K,V>) super.clone();
+    } catch (CloneNotSupportedException cnse) {
+      throw new AssertionError();
+    }
+    // shallow copy: the tables are duplicated, key/value objects are shared
+    clone.keys = keys.clone();
+    clone.values = values.clone();
+    // the clone starts with a blank access history
+    clone.recentlyAccessed = countingAccesses ? new BitSet(keys.length) : null;
+    return clone;
+  }
+
+  /**
+   * NOTE(review): folds entries in slot order, so the result depends on table
+   * layout, not just on the entries — differs from the {@link Map#hashCode()}
+   * contract; confirm this is acceptable for callers.
+   */
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    K[] keys = this.keys;
+    int max = keys.length;
+    for (int i = 0; i < max; i++) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        hash = 31 * hash + key.hashCode();
+        hash = 31 * hash + values[i].hashCode();
+      }
+    }
+    return hash;
+  }
+
+  /**
+   * NOTE(review): compares the two tables slot-by-slot and keys by *reference*
+   * ({@code key != otherKey}), so two maps are equal only when their internal
+   * layouts are identical — differs from {@link Map#equals(Object)}'s
+   * entry-based contract; confirm intended.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof FastMap)) {
+      return false;
+    }
+    FastMap<K,V> otherMap = (FastMap<K,V>) other;
+    K[] otherKeys = otherMap.keys;
+    V[] otherValues = otherMap.values;
+    int length = keys.length;
+    int otherLength = otherKeys.length;
+    int max = Math.min(length, otherLength);
+
+    int i = 0;
+    while (i < max) {
+      K key = keys[i];
+      K otherKey = otherKeys[i];
+      if (key == null || key == REMOVED) {
+        if (otherKey != null && otherKey != REMOVED) {
+          return false;
+        }
+      } else {
+        if (key != otherKey || !values[i].equals(otherValues[i])) {
+          return false;
+        }
+      }
+      i++;
+    }
+    // any remaining slots in the longer table must all be empty
+    while (i < length) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    while (i < otherLength) {
+      K key = otherKeys[i];
+      if (key != null && key != REMOVED) {
+        return false;
+      }
+      i++;
+    }
+    return true;
+  }
+
+  /** Renders live entries as "{k1=v1,k2=v2,...}" in slot order. */
+  @Override
+  public String toString() {
+    if (isEmpty()) {
+      return "{}";
+    }
+    StringBuilder result = new StringBuilder();
+    result.append('{');
+    for (int i = 0; i < keys.length; i++) {
+      K key = keys[i];
+      if (key != null && key != REMOVED) {
+        result.append(key).append('=').append(values[i]).append(',');
+      }
+    }
+    // not empty, so a trailing ',' exists; overwrite it with '}'
+    result.setCharAt(result.length() - 1, '}');
+    return result.toString();
+  }
+
+  /** Live entry view; supports iteration and clear() only. */
+  private final class EntrySet extends AbstractSet<Entry<K,V>> {
+
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+
+    /**
+     * NOTE(review): delegates to containsKey, i.e. treats the argument as a
+     * key rather than a Map.Entry — differs from the Set&lt;Entry&gt;
+     * contract; confirm intended.
+     */
+    @Override
+    public boolean contains(Object o) {
+      return containsKey(o);
+    }
+
+    @Override
+    public Iterator<Entry<K,V>> iterator() {
+      return new EntryIterator();
+    }
+
+    @Override
+    public boolean add(Entry<K,V> t) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean addAll(Collection<? extends Entry<K,V>> ts) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+
+    /** Entry backed directly by a table slot; setValue writes through to the map. */
+    private final class MapEntry implements Entry<K,V> {
+
+      private final int index;
+
+      private MapEntry(int index) {
+        this.index = index;
+      }
+
+      @Override
+      public K getKey() {
+        return keys[index];
+      }
+
+      @Override
+      public V getValue() {
+        return values[index];
+      }
+
+      @Override
+      public V setValue(V value) {
+        Preconditions.checkArgument(value != null);
+        V oldValue = values[index];
+        values[index] = value;
+        return oldValue;
+      }
+    }
+
+    private final class EntryIterator implements Iterator<Entry<K,V>> {
+
+      private int position;      // next table slot to examine
+      private int lastNext = -1; // slot of the entry last returned by next()
+
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < keys.length;
+      }
+
+      @Override
+      public Entry<K,V> next() {
+        goToNext();
+        lastNext = position;
+        if (position >= keys.length) {
+          throw new NoSuchElementException();
+        }
+        return new MapEntry(position++);
+      }
+
+      // Advance past dead slots; tombstoned slots have null values too.
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+    }
+
+  }
+
+  /** Live key view; supports iteration and clear() only. */
+  private final class KeySet extends AbstractSet<K> {
+
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+
+    @Override
+    public boolean contains(Object o) {
+      return containsKey(o);
+    }
+
+    @Override
+    public Iterator<K> iterator() {
+      return new KeyIterator();
+    }
+
+    @Override
+    public boolean add(K t) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean addAll(Collection<? extends K> ts) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+
+    private final class KeyIterator implements Iterator<K> {
+
+      private int position;      // next table slot to examine
+      private int lastNext = -1; // slot of the key last returned by next()
+
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < keys.length;
+      }
+
+      @Override
+      public K next() {
+        goToNext();
+        lastNext = position;
+        if (position >= keys.length) {
+          throw new NoSuchElementException();
+        }
+        return keys[position++];
+      }
+
+      // Advance past dead slots; tombstoned slots have null values too.
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+    }
+
+  }
+
+  /** Live value view; supports iteration and clear() only. */
+  private final class ValueCollection extends AbstractCollection<V> {
+
+    @Override
+    public int size() {
+      return FastMap.this.size();
+    }
+
+    @Override
+    public boolean isEmpty() {
+      return FastMap.this.isEmpty();
+    }
+
+    @Override
+    public boolean contains(Object o) {
+      return containsValue(o);
+    }
+
+    @Override
+    public Iterator<V> iterator() {
+      return new ValueIterator();
+    }
+
+    @Override
+    public boolean add(V v) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean remove(Object o) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean addAll(Collection<? extends V> vs) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean removeAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean retainAll(Collection<?> objects) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void clear() {
+      FastMap.this.clear();
+    }
+
+    private final class ValueIterator implements Iterator<V> {
+
+      private int position;      // next table slot to examine
+      private int lastNext = -1; // slot of the value last returned by next()
+
+      @Override
+      public boolean hasNext() {
+        goToNext();
+        return position < values.length;
+      }
+
+      @Override
+      public V next() {
+        goToNext();
+        lastNext = position;
+        if (position >= values.length) {
+          throw new NoSuchElementException();
+        }
+        return values[position++];
+      }
+
+      // Advance past dead slots; tombstoned slots have null values too.
+      private void goToNext() {
+        int length = values.length;
+        while (position < length && values[position] == null) {
+          position++;
+        }
+      }
+
+      @Override
+      public void remove() {
+        iteratorRemove(lastNext);
+      }
+
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
new file mode 100644
index 0000000..1863d2b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverage.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+/**
+ * <p>
+ * A simple class that represents a fixed value of an average and count. This is useful
+ * when an API needs to return {@link RunningAverage} but is not in a position to accept
+ * updates to it.
+ * </p>
+ */
+public class FixedRunningAverage implements RunningAverage, Serializable {
+
+  // Immutable snapshot: both values are fixed at construction time.
+  private final double average;
+  private final int count;
+
+  public FixedRunningAverage(double average, int count) {
+    this.average = average;
+    this.count = count;
+  }
+
+  /**
+   * Unsupported: this average is fixed.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public synchronized void addDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Unsupported: this average is fixed.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public synchronized void removeDatum(double datum) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Unsupported: this average is fixed.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public synchronized void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+
+  @Override
+  public synchronized int getCount() {
+    return count;
+  }
+
+  @Override
+  public synchronized double getAverage() {
+    return average;
+  }
+
+  @Override
+  public RunningAverage inverse() {
+    return new InvertedRunningAverage(this);
+  }
+
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(average);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
new file mode 100644
index 0000000..619b6b7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FixedRunningAverageAndStdDev.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * A simple class that represents a fixed value of an average, count and standard deviation. This is useful
+ * when an API needs to return {@link RunningAverageAndStdDev} but is not in a position to accept
+ * updates to it.
+ * </p>
+ */
+public final class FixedRunningAverageAndStdDev extends FixedRunningAverage implements RunningAverageAndStdDev {
+
+  // Fixed at construction, like the average and count in the superclass.
+  private final double stdDev;
+
+  public FixedRunningAverageAndStdDev(double average, double stdDev, int count) {
+    super(average, count);
+    this.stdDev = stdDev;
+  }
+
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return new InvertedRunningAverageAndStdDev(this);
+  }
+
+  /** Renders as "average,stdDev". */
+  @Override
+  public synchronized String toString() {
+    return super.toString() + ',' + stdDev;
+  }
+
+  @Override
+  public double getStandardDeviation() {
+    return stdDev;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
new file mode 100644
index 0000000..00d828f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverage.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+import java.io.Serializable;
+
+/**
+ * <p>
+ * A simple class that can keep track of a running average of a series of numbers. One can add to or remove
+ * from the series, as well as update a datum in the series. The class does not actually keep track of the
+ * series of values, just its running average, so it doesn't even matter if you remove/change a value that
+ * wasn't added.
+ * </p>
+ */
+public class FullRunningAverage implements RunningAverage, Serializable {
+
+  private int count;
+  private double average;
+
+  /** Starts with no data: count 0 and an average of NaN. */
+  public FullRunningAverage() {
+    this(0, Double.NaN);
+  }
+
+  public FullRunningAverage(int count, double average) {
+    this.count = count;
+    this.average = average;
+  }
+
+  /**
+   * @param datum
+   *          new item to add to the running average
+   */
+  @Override
+  public synchronized void addDatum(double datum) {
+    if (++count == 1) {
+      average = datum;
+    } else {
+      // incremental mean update: avg_n = avg_{n-1} * (n-1)/n + x/n
+      average = average * (count - 1) / count + datum / count;
+    }
+  }
+
+  /**
+   * @param datum
+   *          item to remove from the running average
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  @Override
+  public synchronized void removeDatum(double datum) {
+    if (count == 0) {
+      throw new IllegalStateException();
+    }
+    if (--count == 0) {
+      average = Double.NaN;
+    } else {
+      // exact inverse of the incremental update in addDatum
+      average = average * (count + 1) / count - datum / count;
+    }
+  }
+
+  /**
+   * @param delta
+   *          amount by which to change a datum in the running average
+   * @throws IllegalStateException
+   *           if count is 0
+   */
+  @Override
+  public synchronized void changeDatum(double delta) {
+    if (count == 0) {
+      throw new IllegalStateException();
+    }
+    // changing one datum by delta shifts the mean by delta/count
+    average += delta / count;
+  }
+
+  @Override
+  public synchronized int getCount() {
+    return count;
+  }
+
+  @Override
+  public synchronized double getAverage() {
+    return average;
+  }
+
+  @Override
+  public RunningAverage inverse() {
+    return new InvertedRunningAverage(this);
+  }
+
+  @Override
+  public synchronized String toString() {
+    return String.valueOf(average);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
new file mode 100644
index 0000000..6212e66
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/common/FullRunningAverageAndStdDev.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.common;
+
+/**
+ * <p>
+ * Extends {@link FullRunningAverage} to add a running standard deviation computation.
+ * Uses Welford's method, as described at http://www.johndcook.com/standard_deviation.html
+ * </p>
+ */
+public final class FullRunningAverageAndStdDev extends FullRunningAverage implements RunningAverageAndStdDev {
+
+  private double stdDev; // cached; recomputed from sk/count after every update
+  private double mk;     // Welford running mean
+  private double sk;     // Welford running sum of squared differences from the mean
+
+  public FullRunningAverageAndStdDev() {
+    mk = 0.0;
+    sk = 0.0;
+    recomputeStdDev();
+  }
+
+  public FullRunningAverageAndStdDev(int count, double average, double mk, double sk) {
+    super(count, average);
+    this.mk = mk;
+    this.sk = sk;
+    recomputeStdDev();
+  }
+
+  public double getMk() {
+    return mk;
+  }
+
+  public double getSk() {
+    return sk;
+  }
+
+  @Override
+  public synchronized double getStandardDeviation() {
+    return stdDev;
+  }
+
+  @Override
+  public synchronized void addDatum(double datum) {
+    super.addDatum(datum);
+    int count = getCount();
+    if (count == 1) {
+      mk = datum;
+      sk = 0.0;
+    } else {
+      // Welford's update: M_k = M_{k-1} + (x - M_{k-1})/k;
+      //                   S_k = S_{k-1} + (x - M_{k-1}) * (x - M_k)
+      double oldmk = mk;
+      double diff = datum - oldmk;
+      mk += diff / count;
+      sk += diff * (datum - mk);
+    }
+    recomputeStdDev();
+  }
+
+  @Override
+  public synchronized void removeDatum(double datum) {
+    int oldCount = getCount();
+    super.removeDatum(datum);
+    // Inverse Welford update. NOTE(review): when oldCount == 1 this divides by
+    // zero, leaving mk non-finite; the superclass resets the average to NaN in
+    // that case — confirm removal at count 1 is never relied upon.
+    double oldmk = mk;
+    mk = (oldCount * oldmk - datum) / (oldCount - 1);
+    sk -= (datum - mk) * (datum - oldmk);
+    recomputeStdDev();
+  }
+
+  /**
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void changeDatum(double delta) {
+    throw new UnsupportedOperationException();
+  }
+
+  private synchronized void recomputeStdDev() {
+    int count = getCount();
+    // sample standard deviation (n - 1 denominator); undefined below 2 data points
+    stdDev = count > 1 ? Math.sqrt(sk / (count - 1)) : Double.NaN;
+  }
+
+  @Override
+  public RunningAverageAndStdDev inverse() {
+    return new InvertedRunningAverageAndStdDev(this);
+  }
+
+  /** Renders as "average,stdDev". */
+  @Override
+  public synchronized String toString() {
+    // Fixed: removed a redundant outer String.valueOf(...) that wrapped an
+    // expression which is already a String; output is unchanged.
+    return String.valueOf(getAverage()) + ',' + stdDev;
+  }
+
+}
r***@apache.org
2018-06-28 14:55:18 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/conf/log4j.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/conf/log4j.xml b/community/mahout-mr/conf/log4j.xml
index 6231b48..179f1a9 100644
--- a/community/mahout-mr/conf/log4j.xml
+++ b/community/mahout-mr/conf/log4j.xml
@@ -1,4 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<appender class="org.apache.log4j.ConsoleAppender" name="console">

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/README.txt b/community/mahout-mr/examples/bin/README.txt
deleted file mode 100644
index 7ad3a38..0000000
--- a/community/mahout-mr/examples/bin/README.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-This directory contains helpful shell scripts for working with some of Mahout's examples.
-
-To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
- Note that this requires the same path to be writable both on the local file system as well as on HDFS.
-
-Here's a description of what each does:
-
-classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically.
-cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
-cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
-factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
-factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
-spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-20newsgroups.sh b/community/mahout-mr/examples/bin/classify-20newsgroups.sh
deleted file mode 100755
index f47d5c5..0000000
--- a/community/mahout-mr/examples/bin/classify-20newsgroups.sh
+++ /dev/null
@@ -1,197 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the 20newsgroups dataset, trains and tests a classifier.
-#
-# To run: change into the mahout directory and type:
-# examples/bin/classify-20newsgroups.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
- exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-${USER}
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding task to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]}"
- echo "4. ${algorithm[3]}"
- echo "5. ${algorithm[4]}"
- echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
- read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-# Spark specific check and work
-if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
- if [ "$MASTER" == "" ] ; then
- echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
- exit 1
- fi
- if [ "$MAHOUT_LOCAL" != "" ] ; then
- echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
- exit 1
- fi
-fi
-
-if [ "x$alg" != "xclean" ]; then
- echo "creating work directory at ${WORK_DIR}"
-
- mkdir -p ${WORK_DIR}
- if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
- if [ ! -e ${WORK_DIR}/20news-bydate ]; then
- if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
- echo "Downloading 20news-bydate"
- curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
- fi
- mkdir -p ${WORK_DIR}/20news-bydate
- echo "Extracting..."
- cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
- fi
- fi
-fi
-#echo $START_PATH
-cd $START_PATH
-cd ../..
-
-set -e
-
-if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
- c=""
-
- if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
- c=" -c"
- fi
-
- set -x
- echo "Preparing 20newsgroups data"
- rm -rf ${WORK_DIR}/20news-all
- mkdir ${WORK_DIR}/20news-all
- cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
-
- if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- echo "Copying 20newsgroups data to HDFS"
- set +e
- $DFSRM ${WORK_DIR}/20news-all
- $DFS -mkdir -p ${WORK_DIR}
- $DFS -mkdir ${WORK_DIR}/20news-all
- set -e
- if [ $HVERSION -eq "1" ] ; then
- echo "Copying 20newsgroups data to Hadoop 1 HDFS"
- $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
- elif [ $HVERSION -eq "2" ] ; then
- echo "Copying 20newsgroups data to Hadoop 2 HDFS"
- $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
- fi
- fi
-
- echo "Creating sequence files from 20newsgroups data"
- ./bin/mahout seqdirectory \
- -i ${WORK_DIR}/20news-all \
- -o ${WORK_DIR}/20news-seq -ow
-
- echo "Converting sequence files to vectors"
- ./bin/mahout seq2sparse \
- -i ${WORK_DIR}/20news-seq \
- -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
-
- echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
- ./bin/mahout split \
- -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
- --trainingOutput ${WORK_DIR}/20news-train-vectors \
- --testOutput ${WORK_DIR}/20news-test-vectors \
- --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
-
- if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then
-
- echo "Training Naive Bayes model"
- ./bin/mahout trainnb \
- -i ${WORK_DIR}/20news-train-vectors \
- -o ${WORK_DIR}/model \
- -li ${WORK_DIR}/labelindex \
- -ow $c
-
- echo "Self testing on training set"
-
- ./bin/mahout testnb \
- -i ${WORK_DIR}/20news-train-vectors\
- -m ${WORK_DIR}/model \
- -l ${WORK_DIR}/labelindex \
- -ow -o ${WORK_DIR}/20news-testing $c
-
- echo "Testing on holdout set"
-
- ./bin/mahout testnb \
- -i ${WORK_DIR}/20news-test-vectors\
- -m ${WORK_DIR}/model \
- -l ${WORK_DIR}/labelindex \
- -ow -o ${WORK_DIR}/20news-testing $c
-
- elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
-
- echo "Training Naive Bayes model"
- ./bin/mahout spark-trainnb \
- -i ${WORK_DIR}/20news-train-vectors \
- -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
-
- echo "Self testing on training set"
- ./bin/mahout spark-testnb \
- -i ${WORK_DIR}/20news-train-vectors\
- -m ${WORK_DIR}/spark-model $c -ma $MASTER
-
- echo "Testing on holdout set"
- ./bin/mahout spark-testnb \
- -i ${WORK_DIR}/20news-test-vectors\
- -m ${WORK_DIR}/spark-model $c -ma $MASTER
-
- fi
-elif [ "x$alg" == "xsgd" ]; then
- if [ ! -e "/tmp/news-group.model" ]; then
- echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
- ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
- fi
- echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
- ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
-elif [ "x$alg" == "xclean" ]; then
- rm -rf $WORK_DIR
- rm -rf /tmp/news-group.model
- $DFSRM $WORK_DIR
-fi
-# Remove the work directory
-#

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/classify-wikipedia.sh b/community/mahout-mr/examples/bin/classify-wikipedia.sh
deleted file mode 100755
index 41dc0c9..0000000
--- a/community/mahout-mr/examples/bin/classify-wikipedia.sh
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run: change into the mahout directory and type:
-# examples/bin/classify-wikipedia.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
- exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
- echo "Please set MAHOUT_HOME."
- exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-wiki
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding task to run"
- echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
- read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
- echo "creating work directory at ${WORK_DIR}"
-
- mkdir -p ${WORK_DIR}
- if [ ! -e ${WORK_DIR}/wikixml ]; then
- mkdir -p ${WORK_DIR}/wikixml
- fi
- if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
- echo "Downloading wikipedia XML dump"
- ########################################################
- # Datasets: uncomment and run "clean" to change dataset
- ########################################################
- ########## partial small 42.5M zipped
- # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########## partial larger 256M zipped
- curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ######### full wikipedia dump: 10G zipped
- # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########################################################
- fi
- if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
- echo "Extracting..."
-
- cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
- fi
-
-echo $START_PATH
-
-set -e
-
-if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
-
- set -x
- echo "Preparing wikipedia data"
- rm -rf ${WORK_DIR}/wiki
- mkdir ${WORK_DIR}/wiki
-
- if [ "x$alg" == "xCBayes" ] ; then
- # use a list of 10 countries as categories
- cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- # use United States and United Kingdom as categories
- cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- echo "Copying wikipedia data to HDFS"
- set +e
- $DFSRM ${WORK_DIR}/wikixml
- $DFS -mkdir -p ${WORK_DIR}
- set -e
- $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
- fi
-
- echo "Creating sequence files from wikiXML"
- $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
- -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
- -o ${WORK_DIR}/wikipediainput
-
- # if using the 10 class problem use bigrams
- if [ "x$alg" == "xCBayes" ] ; then
- echo "Converting sequence files to vectors using bigrams"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
- -o ${WORK_DIR}/wikipediaVecs \
- -wt tfidf \
- -lnorm -nv \
- -ow -ng 2
- fi
-
- # if using the 2 class problem try different options
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
- -o ${WORK_DIR}/wikipediaVecs \
- -wt tfidf \
- -lnorm \
- -nv \
- -ow \
- -ng 1 \
- -x 30
- fi
-
- echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
- $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
- --trainingOutput ${WORK_DIR}/training \
- --testOutput ${WORK_DIR}/testing \
- -rp 20 \
- -ow \
- -seq \
- -xm sequential
-
- echo "Training Naive Bayes model"
- $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
- -o ${WORK_DIR}/model \
- -li ${WORK_DIR}/labelindex \
- -ow \
- -c
-
- echo "Self testing on training set"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
- -m ${WORK_DIR}/model \
- -l ${WORK_DIR}/labelindex \
- -ow \
- -o ${WORK_DIR}/output \
- -c
-
- echo "Testing on holdout set: Bayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
- -m ${WORK_DIR}/model \
- -l ${WORK_DIR}/labelindex \
- -ow \
- -o ${WORK_DIR}/output \
- -seq
-
- echo "Testing on holdout set: CBayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
- -m ${WORK_DIR}/model -l \
- ${WORK_DIR}/labelindex \
- -ow \
- -o ${WORK_DIR}/output \
- -c \
- -seq
-fi
-
-elif [ "x$alg" == "xclean" ]; then
- rm -rf $WORK_DIR
- $DFSRM $WORK_DIR
-fi
-# Remove the work directory

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-reuters.sh b/community/mahout-mr/examples/bin/cluster-reuters.sh
deleted file mode 100755
index 49f6c94..0000000
--- a/community/mahout-mr/examples/bin/cluster-reuters.sh
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Reuters dataset and prepares it for clustering
-#
-# To run: change into the mahout directory and type:
-# examples/bin/cluster-reuters.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically."
- exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-MAHOUT="../../bin/mahout"
-
-if [ ! -e $MAHOUT ]; then
- echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
- exit 1
-fi
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-${USER}
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding clustering algorithm"
- echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
- echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
- echo "3. ${algorithm[2]} clustering"
- echo "4. ${algorithm[3]} clustering"
- echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
- read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-if [ "x$clustertype" == "xclean" ]; then
- rm -rf $WORK_DIR
- $DFSRM $WORK_DIR
- exit 1
-else
- $DFS -mkdir -p $WORK_DIR
- mkdir -p $WORK_DIR
- echo "Creating work directory at ${WORK_DIR}"
-fi
-if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
- if [ ! -e ${WORK_DIR}/reuters-out ]; then
- if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
- if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
- if [ -n "$2" ]; then
- echo "Copying Reuters from local download"
- cp $2 ${WORK_DIR}/reuters21578.tar.gz
- else
- echo "Downloading Reuters-21578"
- curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
- fi
- fi
- #make sure it was actually downloaded
- if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
- echo "Failed to download reuters"
- exit 1
- fi
- mkdir -p ${WORK_DIR}/reuters-sgm
- echo "Extracting..."
- tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
- fi
- echo "Extracting Reuters"
- $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
- if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- echo "Copying Reuters data to Hadoop"
- set +e
- $DFSRM ${WORK_DIR}/reuters-sgm
- $DFSRM ${WORK_DIR}/reuters-out
- $DFS -mkdir -p ${WORK_DIR}/
- $DFS -mkdir ${WORK_DIR}/reuters-sgm
- $DFS -mkdir ${WORK_DIR}/reuters-out
- $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
- $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
- set -e
- fi
- fi
- echo "Converting to Sequence Files from Directory"
- $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
-fi
-
-if [ "x$clustertype" == "xkmeans" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
- && \
- $MAHOUT kmeans \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
- -c ${WORK_DIR}/reuters-kmeans-clusters \
- -o ${WORK_DIR}/reuters-kmeans \
- -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
- -x 10 -k 20 -ow --clustering \
- && \
- $MAHOUT clusterdump \
- -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
- -o ${WORK_DIR}/reuters-kmeans/clusterdump \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
- --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
- && \
- cat ${WORK_DIR}/reuters-kmeans/clusterdump
-elif [ "x$clustertype" == "xfuzzykmeans" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
- && \
- $MAHOUT fkmeans \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
- -c ${WORK_DIR}/reuters-fkmeans-clusters \
- -o ${WORK_DIR}/reuters-fkmeans \
- -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
- -x 10 -k 20 -ow -m 1.1 \
- && \
- $MAHOUT clusterdump \
- -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
- -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
- -dt sequencefile -b 100 -n 20 -sp 0 \
- && \
- cat ${WORK_DIR}/reuters-fkmeans/clusterdump
-elif [ "x$clustertype" == "xlda" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
- && \
- $MAHOUT rowid \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
- -o ${WORK_DIR}/reuters-out-matrix \
- && \
- rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
- && \
- $MAHOUT cvb \
- -i ${WORK_DIR}/reuters-out-matrix/matrix \
- -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
- -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
- -dt ${WORK_DIR}/reuters-lda-topics \
- -mt ${WORK_DIR}/reuters-lda-model \
- && \
- $MAHOUT vectordump \
- -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
- -o ${WORK_DIR}/reuters-lda/vectordump \
- -vs 10 -p true \
- -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
- -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
- && \
- cat ${WORK_DIR}/reuters-lda/vectordump
-elif [ "x$clustertype" == "xstreamingkmeans" ]; then
- $MAHOUT seq2sparse \
- -i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
- && \
- rm -rf ${WORK_DIR}/reuters-streamingkmeans \
- && \
- $MAHOUT streamingkmeans \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
- --tempDir ${WORK_DIR}/tmp \
- -o ${WORK_DIR}/reuters-streamingkmeans \
- -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
- -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
- -k 10 -km 100 -ow \
- && \
- $MAHOUT qualcluster \
- -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
- -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
- -o ${WORK_DIR}/reuters-cluster-distance.csv \
- && \
- cat ${WORK_DIR}/reuters-cluster-distance.csv
-fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
deleted file mode 100755
index 796da33..0000000
--- a/community/mahout-mr/examples/bin/cluster-syntheticcontrol.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads the Synthetic control dataset and prepares it for clustering
-#
-# To run: change into the mahout directory and type:
-# examples/bin/cluster-syntheticcontrol.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically."
- exit
-fi
-
-algorithm=( kmeans fuzzykmeans )
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding clustering algorithm"
- echo "1. ${algorithm[0]} clustering"
- echo "2. ${algorithm[1]} clustering"
- read -p "Enter your choice : " choice
-fi
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
-clustertype=${algorithm[$choice-1]}
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-${USER}
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
- if [ -n "$2" ]; then
- cp $2 ${WORK_DIR}/.
- else
- echo "Downloading Synthetic control data"
- curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data
- fi
-fi
-if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
- echo "Couldn't download synthetic control"
- exit 1
-fi
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
- echo "Checking the health of DFS..."
- $DFS -ls /
- if [ $? -eq 0 ];then
- echo "DFS is healthy... "
- echo "Uploading Synthetic control data to HDFS"
- $DFSRM ${WORK_DIR}/testdata
- $DFS -mkdir -p ${WORK_DIR}/testdata
- $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
- echo "Successfully Uploaded Synthetic control data to HDFS "
-
- options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
-
- if [ "${clustertype}" == "kmeans" ]; then
- options="${options} --numClusters 6"
- # t1 & t2 not used if --numClusters specified, but parser requires input
- options="${options} --t1 1 --t2 2"
- ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
- else
- options="${options} --m 2.0f --t1 80 --t2 55"
- ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
- fi
- else
- echo " HADOOP is not running. Please make sure you hadoop is running. "
- fi
-elif [ "$MAHOUT_LOCAL" != "" ]; then
- echo "running MAHOUT_LOCAL"
- cp ${WORK_DIR}/synthetic_control.data testdata
- ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
- rm testdata
-else
- echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
-fi
-# Remove the work directory
-rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
deleted file mode 100755
index 29730e1..0000000
--- a/community/mahout-mr/examples/bin/factorize-movielens-1M.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# Before using this script, you have to download and extract the Movielens 1M dataset
-# from http://www.grouplens.org/node/73
-#
-# To run: change into the mahout directory and type:
-# export MAHOUT_LOCAL=true
-# Then:
-# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
- echo "Syntax: $0 /path/to/ratings.dat\n"
- exit
-fi
-
-if [ $# -ne 1 ]
-then
- echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
- echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
- echo -e "Syntax: $0 /path/to/ratings.dat\n"
- exit -1
-fi
-
-export MAHOUT_LOCAL=true
-MAHOUT="$MAHOUT_HOME/bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-${USER}
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-echo "creating work directory at ${WORK_DIR}"
-mkdir -p ${WORK_DIR}/movielens
-
-echo "Converting ratings..."
-cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
-
-# create a 90% percent training set and a 10% probe set
-$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
- --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
- --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
- --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-# compute recommendations
-$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
- --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
- --numRecommendations 6 --maxRating 5 --numThreads 2
-
-# print the error
-echo -e "\nRMSE is:\n"
-cat ${WORK_DIR}/als/rmse/rmse.txt
-echo -e "\n"
-
-echo -e "\nSample recommendations:\n"
-shuf ${WORK_DIR}/recommendations/part-m-00000 |head
-echo -e "\n\n"
-
-echo "removing work directory"
-rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/factorize-netflix.sh b/community/mahout-mr/examples/bin/factorize-netflix.sh
deleted file mode 100755
index 26faf66..0000000
--- a/community/mahout-mr/examples/bin/factorize-netflix.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Instructions:
-#
-# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
-# following:
-#
-# 1) the path to the folder 'training_set' that contains all the movie rating files
-# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
-# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
-#
-# To run:
-# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
-
-echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
-exit 1
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script runs the ALS Recommender on the Netflix data set."
- echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
- exit
-fi
-
-if [ $# -ne 3 ]
-then
- echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
- exit -1
-fi
-
-MAHOUT="../../bin/mahout"
-
-if [[ -z "$MAHOUT_WORK_DIR" ]]; then
- WORK_DIR=/tmp/mahout-work-${USER}
-else
- WORK_DIR=$MAHOUT_WORK_DIR
-fi
-
-START_PATH=`pwd`
-
-# Set commands for dfs
-source ${START_PATH}/set-dfs-commands.sh
-
-echo "Preparing data..."
-$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
-
-# run distributed ALS-WR to factorize the rating matrix defined by the training set
-$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
- --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
-
-# compute predictions against the probe set, measure the error
-$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
- --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
-
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
-
- # print the error, should be around 0.923
- echo -e "\nRMSE is:\n"
- $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
- echo -e "\n"
- echo "removing work directory"
- set +e
- $DFSRM ${WORK_DIR}
-
-else
-
- # print the error, should be around 0.923
- echo -e "\nRMSE is:\n"
- cat ${WORK_DIR}/als/rmse/rmse.txt
- echo -e "\n"
- echo "removing work directory"
- rm -rf ${WORK_DIR}
-
-fi
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/get-all-examples.sh b/community/mahout-mr/examples/bin/get-all-examples.sh
deleted file mode 100755
index 4128e47..0000000
--- a/community/mahout-mr/examples/bin/get-all-examples.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Clones Mahout example code from remote repositories with their own
-# build process. Follow the README for each example for instructions.
-#
-# Usage: change into the mahout directory and type:
-# examples/bin/get-all-examples.sh
-
-# Solr-recommender
-echo " Solr-recommender example: "
-echo " 1) imports text 'log files' of some delimited form for user preferences"
-echo " 2) creates the correct Mahout files and stores dictionaries to translate external Ids to and from Mahout Ids"
-echo " 3) it implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
-echo " 4) it creates output for user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
-echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
-echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input."
-echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
-echo " To build run 'cd solr-recommender; mvn install'"
-echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
-echo " run 'cd scripts; ./solr-recommender-example'"
-git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/lda.algorithm b/community/mahout-mr/examples/bin/lda.algorithm
deleted file mode 100644
index fb84ea0..0000000
--- a/community/mahout-mr/examples/bin/lda.algorithm
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-merge.policy=org.apache.lucene.index.LogDocMergePolicy
-merge.factor=mrg:10:20
-max.buffered=buf:100:1000
-compound=true
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.term.vector=true
-doc.tokenized=true
-log.step=600
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-content.source.forever=false
-doc.maker.forever=false
-query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=false
-# --------- alg
-{ "BuildReuters"
- CreateIndex
- { "AddDocs" AddDoc > : *
-# Optimize
- CloseIndex
-}
-
r***@apache.org
2018-06-28 14:54:44 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
new file mode 100644
index 0000000..1490761
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/Rescorer.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.recommender;
+
+/**
+ * <p>
+ * A {@link Rescorer} simply assigns a new "score" to a thing like an ID of an item or user which a
+ * {@link Recommender} is considering returning as a top recommendation. It may be used to arbitrarily re-rank
+ * the results according to application-specific logic before returning recommendations. For example, an
+ * application may want to boost the score of items in a certain category just for one request.
+ * </p>
+ *
+ * <p>
+ * A {@link Rescorer} can also exclude a thing from consideration entirely by returning {@code true} from
+ * {@link #isFiltered(Object)}.
+ * </p>
+ */
+public interface Rescorer<T> {
+
+ /**
+ * @param thing
+ * thing to rescore
+ * @param originalScore
+ * original score
+ * @return modified score, or {@link Double#NaN} to indicate that this should be excluded entirely
+ */
+ double rescore(T thing, double originalScore);
+
+ /**
+ * Returns {@code true} to exclude the given thing.
+ *
+ * @param thing
+ * the thing to filter
+ * @return {@code true} to exclude, {@code false} otherwise
+ */
+ boolean isFiltered(T thing);
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
new file mode 100644
index 0000000..b48593a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/recommender/UserBasedRecommender.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.recommender;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * <p>
+ * Interface implemented by "user-based" recommenders.
+ * </p>
+ */
+public interface UserBasedRecommender extends Recommender {
+
+ /**
+ * @param userID
+ * ID of user for which to find most similar other users
+ * @param howMany
+ * desired number of most similar users to find
+ * @return users most similar to the given user
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ long[] mostSimilarUserIDs(long userID, int howMany) throws TasteException;
+
+ /**
+ * @param userID
+ * ID of user for which to find most similar other users
+ * @param howMany
+ * desired number of most similar users to find
+ * @param rescorer
+ * {@link Rescorer} which can adjust user-user similarity estimates used to determine most similar
+ * users
+ * @return IDs of users most similar to the given user
+ * @throws TasteException
+ * if an error occurs while accessing the {@link org.apache.mahout.cf.taste.model.DataModel}
+ */
+ long[] mostSimilarUserIDs(long userID, int howMany, Rescorer<LongPair> rescorer) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
new file mode 100644
index 0000000..814610b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/ItemSimilarity.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface define a notion of similarity between two items. Implementations should
+ * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity.
+ * </p>
+ *
+ * @see UserSimilarity
+ */
+public interface ItemSimilarity extends Refreshable {
+
+ /**
+ * <p>
+ * Returns the degree of similarity, of two items, based on the preferences that users have expressed for
+ * the items.
+ * </p>
+ *
+ * @param itemID1 first item ID
+ * @param itemID2 second item ID
+ * @return similarity between the items, in [-1,1] or {@link Double#NaN} if similarity is unknown
+ * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+ * if either item is known to be non-existent in the data
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ double itemSimilarity(long itemID1, long itemID2) throws TasteException;
+
+ /**
+ * <p>A bulk-get version of {@link #itemSimilarity(long, long)}.</p>
+ *
+ * @param itemID1 first item ID
+ * @param itemID2s second item IDs to compute similarity with
+ * @return similarity between itemID1 and other items
+ * @throws org.apache.mahout.cf.taste.common.NoSuchItemException
+ * if any item is known to be non-existent in the data
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException;
+
+ /**
+ * @return all IDs of similar items, in no particular order
+ */
+ long[] allSimilarItemIDs(long itemID) throws TasteException;
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
new file mode 100644
index 0000000..76bb328
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/PreferenceInferrer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface compute an inferred preference for a user and an item that the user has
+ * not expressed any preference for. This might be an average of other preferences scores from that user, for
+ * example. This technique is sometimes called "default voting".
+ * </p>
+ */
+public interface PreferenceInferrer extends Refreshable {
+
+ /**
+ * <p>
+ * Infers the given user's preference value for an item.
+ * </p>
+ *
+ * @param userID
+ * ID of user to infer preference for
+ * @param itemID
+ * item ID to infer preference for
+ * @return inferred preference
+ * @throws TasteException
+ * if an error occurs while inferring
+ */
+ float inferPreference(long userID, long itemID) throws TasteException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
new file mode 100644
index 0000000..bd53c51
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/UserSimilarity.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+
+/**
+ * <p>
+ * Implementations of this interface define a notion of similarity between two users. Implementations should
+ * return values in the range -1.0 to 1.0, with 1.0 representing perfect similarity.
+ * </p>
+ *
+ * @see ItemSimilarity
+ */
+public interface UserSimilarity extends Refreshable {
+
+ /**
+ * <p>
+ * Returns the degree of similarity, of two users, based on their preferences.
+ * </p>
+ *
+ * @param userID1 first user ID
+ * @param userID2 second user ID
+ * @return similarity between the users, in [-1,1] or {@link Double#NaN} if similarity is unknown
+ * @throws org.apache.mahout.cf.taste.common.NoSuchUserException
+ * if either user is known to be non-existent in the data
+ * @throws TasteException if an error occurs while accessing the data
+ */
+ double userSimilarity(long userID1, long userID2) throws TasteException;
+
+ // Should we implement userSimilarities() like ItemSimilarity.itemSimilarities()?
+
+ /**
+ * <p>
+ * Attaches a {@link PreferenceInferrer} to the {@link UserSimilarity} implementation.
+ * </p>
+ *
+ * @param inferrer {@link PreferenceInferrer}
+ */
+ void setPreferenceInferrer(PreferenceInferrer inferrer);
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
new file mode 100644
index 0000000..b934d0c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/BatchItemSimilarities.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+
+import java.io.IOException;
+
+public abstract class BatchItemSimilarities {
+
+ private final ItemBasedRecommender recommender;
+ private final int similarItemsPerItem;
+
+ /**
+ * @param recommender recommender to use
+ * @param similarItemsPerItem number of similar items to compute per item
+ */
+ protected BatchItemSimilarities(ItemBasedRecommender recommender, int similarItemsPerItem) {
+ this.recommender = recommender;
+ this.similarItemsPerItem = similarItemsPerItem;
+ }
+
+ protected ItemBasedRecommender getRecommender() {
+ return recommender;
+ }
+
+ protected int getSimilarItemsPerItem() {
+ return similarItemsPerItem;
+ }
+
+ /**
+ * @param degreeOfParallelism number of threads to use for the computation
+ * @param maxDurationInHours maximum duration of the computation
+ * @param writer {@link SimilarItemsWriter} used to persist the results
+ * @return the number of similarities precomputed
+ * @throws IOException if an error occurs while persisting the precomputed similarities
+ * @throws RuntimeException if the computation takes longer than maxDurationInHours
+ */
+ public abstract int computeItemSimilarities(int degreeOfParallelism, int maxDurationInHours,
+ SimilarItemsWriter writer) throws IOException;
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
new file mode 100644
index 0000000..5d40051
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItem.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import com.google.common.primitives.Doubles;
+
+import java.util.Comparator;
+
+/**
+ * Modeling similarity towards another item
+ */
+public class SimilarItem {
+
+ public static final Comparator<SimilarItem> COMPARE_BY_SIMILARITY = new Comparator<SimilarItem>() {
+ @Override
+ public int compare(SimilarItem s1, SimilarItem s2) {
+ return Doubles.compare(s1.similarity, s2.similarity);
+ }
+ };
+
+ private long itemID;
+ private double similarity;
+
+ public SimilarItem(long itemID, double similarity) {
+ set(itemID, similarity);
+ }
+
+ public void set(long itemID, double similarity) {
+ this.itemID = itemID;
+ this.similarity = similarity;
+ }
+
+ public long getItemID() {
+ return itemID;
+ }
+
+ public double getSimilarity() {
+ return similarity;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
new file mode 100644
index 0000000..057e996
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItems.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import com.google.common.collect.UnmodifiableIterator;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * Compact representation of all similar items for an item
+ */
+public class SimilarItems {
+
+ private final long itemID;
+ private final long[] similarItemIDs;
+ private final double[] similarities;
+
+ public SimilarItems(long itemID, List<RecommendedItem> similarItems) {
+ this.itemID = itemID;
+
+ int numSimilarItems = similarItems.size();
+ similarItemIDs = new long[numSimilarItems];
+ similarities = new double[numSimilarItems];
+
+ for (int n = 0; n < numSimilarItems; n++) {
+ similarItemIDs[n] = similarItems.get(n).getItemID();
+ similarities[n] = similarItems.get(n).getValue();
+ }
+ }
+
+ public long getItemID() {
+ return itemID;
+ }
+
+ public int numSimilarItems() {
+ return similarItemIDs.length;
+ }
+
+ public Iterable<SimilarItem> getSimilarItems() {
+ return new Iterable<SimilarItem>() {
+ @Override
+ public Iterator<SimilarItem> iterator() {
+ return new SimilarItemsIterator();
+ }
+ };
+ }
+
+ private class SimilarItemsIterator extends UnmodifiableIterator<SimilarItem> {
+
+ private int index = -1;
+
+ @Override
+ public boolean hasNext() {
+ return index < (similarItemIDs.length - 1);
+ }
+
+ @Override
+ public SimilarItem next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ index++;
+ return new SimilarItem(similarItemIDs[index], similarities[index]);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
new file mode 100644
index 0000000..35d6bfe
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/SimilarItemsWriter.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+/**
+ * Used to persist the results of a batch item similarity computation
+ * conducted with a {@link BatchItemSimilarities} implementation
+ */
+public interface SimilarItemsWriter extends Closeable {
+
+ void open() throws IOException;
+
+ void add(SimilarItems similarItems) throws IOException;
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
new file mode 100644
index 0000000..efd233f
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Defines the interface for classifiers that take a vector as input. This is
+ * implemented as an abstract class so that it can implement a number of handy
+ * convenience methods related to classification of vectors.
+ *
+ * <p>
+ * A classifier takes an input vector and calculates the scores (usually
+ * probabilities) that the input vector belongs to one of {@code n}
+ * categories. In {@code AbstractVectorClassifier} each category is denoted
+ * by an integer {@code c} between {@code 0} and {@code n-1}
+ * (inclusive).
+ *
+ * <p>
+ * New users should start by looking at {@link #classifyFull} (not {@link #classify}).
+ *
+ */
+public abstract class AbstractVectorClassifier {
+
+ /** Minimum allowable log likelihood value. */
+ public static final double MIN_LOG_LIKELIHOOD = -100.0;
+
+ /**
+ * Returns the number of categories that a target variable can be assigned to.
+ * A vector classifier will encode its output as an integer from
+ * {@code 0} to {@code numCategories()-1} (inclusive).
+ *
+ * @return The number of categories.
+ */
+ public abstract int numCategories();
+
+ /**
+ * Compute and return a vector containing {@code n-1} scores, where
+ * {@code n} is equal to {@code numCategories()}, given an input
+ * vector {@code instance}. Higher scores indicate that the input vector
+ * is more likely to belong to that category. The categories are denoted by
+ * the integers {@code 0} through {@code n-1} (inclusive), and the
+ * scores in the returned vector correspond to categories 1 through
+ * {@code n-1} (leaving out category 0). It is assumed that the score for
+ * category 0 is one minus the sum of the scores in the returned vector.
+ *
+ * @param instance A feature vector to be classified.
+ * @return A vector of probabilities in 1 of {@code n-1} encoding.
+ */
+ public abstract Vector classify(Vector instance);
+
+ /**
+ * Compute and return a vector of scores before applying the inverse link
+ * function. For logistic regression and other generalized linear models, this
+ * is just the linear part of the classification.
+ *
+ * <p>
+ * The implementation of this method provided by {@code AbstractVectorClassifier} throws an
+ * {@link UnsupportedOperationException}. Your subclass must explicitly override this method to support
+ * this operation.
+ *
+ * @param features A feature vector to be classified.
+ * @return A vector of scores. If transformed by the link function, these will become probabilities.
+ */
+ public Vector classifyNoLink(Vector features) {
+ throw new UnsupportedOperationException(this.getClass().getName()
+ + " doesn't support classification without a link");
+ }
+
+ /**
+ * Classifies a vector in the special case of a binary classifier where
+ * {@link #classify(Vector)} would return a vector with only one element. As
+ * such, using this method can avoid the allocation of a vector.
+ *
+ * @param instance The feature vector to be classified.
+ * @return The score for category 1.
+ *
+ * @see #classify(Vector)
+ */
+ public abstract double classifyScalar(Vector instance);
+
+ /**
+ * Computes and returns a vector containing {@code n} scores, where
+ * {@code n} is {@code numCategories()}, given an input vector
+ * {@code instance}. Higher scores indicate that the input vector is more
+ * likely to belong to the corresponding category. The categories are denoted
+ * by the integers {@code 0} through {@code n-1} (inclusive).
+ *
+ * <p>
+ * Using this method it is possible to classify an input vector, for example,
+ * by selecting the category with the largest score. If
+ * {@code classifier} is an instance of
+ * {@code AbstractVectorClassifier} and {@code input} is a
+ * {@code Vector} of features describing an element to be classified,
+ * then the following code could be used to classify {@code input}.<br>
+ * {@code
+ * Vector scores = classifier.classifyFull(input);<br>
+ * int assignedCategory = scores.maxValueIndex();<br>
+ * } Here {@code assignedCategory} is the index of the category
+ * with the maximum score.
+ *
+ * <p>
+ * If an {@code n-1} encoding is acceptable, and allocation performance
+ * is an issue, then the {@link #classify(Vector)} method is probably better
+ * to use.
+ *
+ * @see #classify(Vector)
+ * @see #classifyFull(Vector r, Vector instance)
+ *
+ * @param instance A vector of features to be classified.
+ * @return A vector of probabilities, one for each category.
+ */
+ public Vector classifyFull(Vector instance) {
+ return classifyFull(new DenseVector(numCategories()), instance);
+ }
+
+ /**
+ * Computes and returns a vector containing {@code n} scores, where
+ * {@code n} is {@code numCategories()}, given an input vector
+ * {@code instance}. Higher scores indicate that the input vector is more
+ * likely to belong to the corresponding category. The categories are denoted
+ * by the integers {@code 0} through {@code n-1} (inclusive). The
+ * main difference between this method and {@link #classifyFull(Vector)} is
+ * that this method allows a user to provide a previously allocated
+ * {@code Vector r} to store the returned scores.
+ *
+ * <p>
+ * Using this method it is possible to classify an input vector, for example,
+ * by selecting the category with the largest score. If
+ * {@code classifier} is an instance of
+ * {@code AbstractVectorClassifier}, {@code result} is a non-null
+ * {@code Vector}, and {@code input} is a {@code Vector} of
+ * features describing an element to be classified, then the following code
+ * could be used to classify {@code input}.<br>
+ * {@code
+ * Vector scores = classifier.classifyFull(result, input); // Notice that scores == result<br>
+ * int assignedCategory = scores.maxValueIndex();<br>
+ * } Here {@code assignedCategory} is the index of the category
+ * with the maximum score.
+ *
+ * @param r Where to put the results.
+ * @param instance A vector of features to be classified.
+ * @return A vector of scores/probabilities, one for each category.
+ */
+ public Vector classifyFull(Vector r, Vector instance) {
+ r.viewPart(1, numCategories() - 1).assign(classify(instance));
+ r.setQuick(0, 1.0 - r.zSum());
+ return r;
+ }
+
+
+ /**
+ * Returns n-1 probabilities, one for each categories 1 through
+ * {@code n-1}, for each row of a matrix, where {@code n} is equal
+ * to {@code numCategories()}. The probability of the missing 0-th
+ * category is 1 - rowSum(this result).
+ *
+ * @param data The matrix whose rows are the input vectors to classify
+ * @return A matrix of scores, one row per row of the input matrix, one column for each but the last category.
+ */
+ public Matrix classify(Matrix data) {
+ Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
+ for (int row = 0; row < data.numRows(); row++) {
+ r.assignRow(row, classify(data.viewRow(row)));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a matrix where the rows of the matrix each contain {@code n} probabilities, one for each category.
+ *
+ * @param data The matrix whose rows are the input vectors to classify
+ * @return A matrix of scores, one row per row of the input matrix, one column for each but the last category.
+ */
+ public Matrix classifyFull(Matrix data) {
+ Matrix r = new DenseMatrix(data.numRows(), numCategories());
+ for (int row = 0; row < data.numRows(); row++) {
+ classifyFull(r.viewRow(row), data.viewRow(row));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a vector of probabilities of category 1, one for each row
+ * of a matrix. This only makes sense if there are exactly two categories, but
+ * calling this method in that case can save a number of vector allocations.
+ *
+ * @param data The matrix whose rows are vectors to classify
+ * @return A vector of scores, with one value per row of the input matrix.
+ */
+ public Vector classifyScalar(Matrix data) {
+ Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");
+
+ Vector r = new DenseVector(data.numRows());
+ for (int row = 0; row < data.numRows(); row++) {
+ r.set(row, classifyScalar(data.viewRow(row)));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a measure of how good the classification for a particular example
+ * actually is.
+ *
+ * @param actual The correct category for the example.
+ * @param data The vector to be classified.
+ * @return The log likelihood of the correct answer as estimated by the current model. This will always be <= 0
+ * and larger (closer to 0) indicates better accuracy. In order to simplify code that maintains running averages,
+ * we bound this value at -100.
+ */
+ public double logLikelihood(int actual, Vector data) {
+ if (numCategories() == 2) {
+ double p = classifyScalar(data);
+ if (actual > 0) {
+ return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p));
+ } else {
+ return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p));
+ }
+ } else {
+ Vector p = classify(data);
+ if (actual > 0) {
+ return Math.max(MIN_LOG_LIKELIHOOD, Math.log(p.get(actual - 1)));
+ } else {
+ return Math.max(MIN_LOG_LIKELIHOOD, Math.log1p(-p.zSum()));
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
new file mode 100644
index 0000000..29eaa0d
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ClassifierResult.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+/**
+ * Result of a document classification. The label and the associated score (usually probability)
+ */
+public class ClassifierResult {
+
+ private String label;
+ private double score;
+ private double logLikelihood = Double.MAX_VALUE;
+
+ public ClassifierResult() { }
+
+ public ClassifierResult(String label, double score) {
+ this.label = label;
+ this.score = score;
+ }
+
+ public ClassifierResult(String label) {
+ this.label = label;
+ }
+
+ public ClassifierResult(String label, double score, double logLikelihood) {
+ this.label = label;
+ this.score = score;
+ this.logLikelihood = logLikelihood;
+ }
+
+ public double getLogLikelihood() {
+ return logLikelihood;
+ }
+
+ public void setLogLikelihood(double logLikelihood) {
+ this.logLikelihood = logLikelihood;
+ }
+
+ public String getLabel() {
+ return label;
+ }
+
+ public double getScore() {
+ return score;
+ }
+
+ public void setLabel(String label) {
+ this.label = label;
+ }
+
+ public void setScore(double score) {
+ this.score = score;
+ }
+
+ @Override
+ public String toString() {
+ return "ClassifierResult{" + "category='" + label + '\'' + ", score=" + score + '}';
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
new file mode 100644
index 0000000..73ba521
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ConfusionMatrix.java
@@ -0,0 +1,444 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.Matrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The ConfusionMatrix Class stores the result of Classification of a Test Dataset.
+ *
+ * The fact of whether there is a default is not stored. A row of zeros is the only indicator that there is no default.
+ *
+ * See http://en.wikipedia.org/wiki/Confusion_matrix for background
+ */
+public class ConfusionMatrix {
+ private static final Logger LOG = LoggerFactory.getLogger(ConfusionMatrix.class);
+ private final Map<String,Integer> labelMap = new LinkedHashMap<>();
+ private final int[][] confusionMatrix;
+ private int samples = 0;
+ private String defaultLabel = "unknown";
+
+ public ConfusionMatrix(Collection<String> labels, String defaultLabel) {
+ confusionMatrix = new int[labels.size() + 1][labels.size() + 1];
+ this.defaultLabel = defaultLabel;
+ int i = 0;
+ for (String label : labels) {
+ labelMap.put(label, i++);
+ }
+ labelMap.put(defaultLabel, i);
+ }
+
+ public ConfusionMatrix(Matrix m) {
+ confusionMatrix = new int[m.numRows()][m.numRows()];
+ setMatrix(m);
+ }
+
+ public int[][] getConfusionMatrix() {
+ return confusionMatrix;
+ }
+
+ public Collection<String> getLabels() {
+ return Collections.unmodifiableCollection(labelMap.keySet());
+ }
+
+ private int numLabels() {
+ return labelMap.size();
+ }
+
+ public double getAccuracy(String label) {
+ int labelId = labelMap.get(label);
+ int labelTotal = 0;
+ int correct = 0;
+ for (int i = 0; i < numLabels(); i++) {
+ labelTotal += confusionMatrix[labelId][i];
+ if (i == labelId) {
+ correct += confusionMatrix[labelId][i];
+ }
+ }
+ return 100.0 * correct / labelTotal;
+ }
+
+ // Producer accuracy
+ public double getAccuracy() {
+ int total = 0;
+ int correct = 0;
+ for (int i = 0; i < numLabels(); i++) {
+ for (int j = 0; j < numLabels(); j++) {
+ total += confusionMatrix[i][j];
+ if (i == j) {
+ correct += confusionMatrix[i][j];
+ }
+ }
+ }
+ return 100.0 * correct / total;
+ }
+
+ /** Sum of true positives and false negatives */
+ private int getActualNumberOfTestExamplesForClass(String label) {
+ int labelId = labelMap.get(label);
+ int sum = 0;
+ for (int i = 0; i < numLabels(); i++) {
+ sum += confusionMatrix[labelId][i];
+ }
+ return sum;
+ }
+
+ public double getPrecision(String label) {
+ int labelId = labelMap.get(label);
+ int truePositives = confusionMatrix[labelId][labelId];
+ int falsePositives = 0;
+ for (int i = 0; i < numLabels(); i++) {
+ if (i == labelId) {
+ continue;
+ }
+ falsePositives += confusionMatrix[i][labelId];
+ }
+
+ if (truePositives + falsePositives == 0) {
+ return 0;
+ }
+
+ return ((double) truePositives) / (truePositives + falsePositives);
+ }
+
+ public double getWeightedPrecision() {
+ double[] precisions = new double[numLabels()];
+ double[] weights = new double[numLabels()];
+
+ int index = 0;
+ for (String label : labelMap.keySet()) {
+ precisions[index] = getPrecision(label);
+ weights[index] = getActualNumberOfTestExamplesForClass(label);
+ index++;
+ }
+ return new Mean().evaluate(precisions, weights);
+ }
+
+ public double getRecall(String label) {
+ int labelId = labelMap.get(label);
+ int truePositives = confusionMatrix[labelId][labelId];
+ int falseNegatives = 0;
+ for (int i = 0; i < numLabels(); i++) {
+ if (i == labelId) {
+ continue;
+ }
+ falseNegatives += confusionMatrix[labelId][i];
+ }
+ if (truePositives + falseNegatives == 0) {
+ return 0;
+ }
+ return ((double) truePositives) / (truePositives + falseNegatives);
+ }
+
+ public double getWeightedRecall() {
+ double[] recalls = new double[numLabels()];
+ double[] weights = new double[numLabels()];
+
+ int index = 0;
+ for (String label : labelMap.keySet()) {
+ recalls[index] = getRecall(label);
+ weights[index] = getActualNumberOfTestExamplesForClass(label);
+ index++;
+ }
+ return new Mean().evaluate(recalls, weights);
+ }
+
+ public double getF1score(String label) {
+ double precision = getPrecision(label);
+ double recall = getRecall(label);
+ if (precision + recall == 0) {
+ return 0;
+ }
+ return 2 * precision * recall / (precision + recall);
+ }
+
+ public double getWeightedF1score() {
+ double[] f1Scores = new double[numLabels()];
+ double[] weights = new double[numLabels()];
+
+ int index = 0;
+ for (String label : labelMap.keySet()) {
+ f1Scores[index] = getF1score(label);
+ weights[index] = getActualNumberOfTestExamplesForClass(label);
+ index++;
+ }
+ return new Mean().evaluate(f1Scores, weights);
+ }
+
+ // User accuracy
+ public double getReliability() {
+ int count = 0;
+ double accuracy = 0;
+ for (String label: labelMap.keySet()) {
+ if (!label.equals(defaultLabel)) {
+ accuracy += getAccuracy(label);
+ }
+ count++;
+ }
+ return accuracy / count;
+ }
+
+ /**
+ * Accuracy v.s. randomly classifying all samples.
+ * kappa() = (totalAccuracy() - randomAccuracy()) / (1 - randomAccuracy())
+ * Cohen, Jacob. 1960. A coefficient of agreement for nominal scales.
+ * Educational And Psychological Measurement 20:37-46.
+ *
+ * Formula and variable names from:
+ * http://www.yale.edu/ceo/OEFS/Accuracy.pdf
+ *
+ * @return double
+ */
+ public double getKappa() {
+ double a = 0.0;
+ double b = 0.0;
+ for (int i = 0; i < confusionMatrix.length; i++) {
+ a += confusionMatrix[i][i];
+ double br = 0;
+ for (int j = 0; j < confusionMatrix.length; j++) {
+ br += confusionMatrix[i][j];
+ }
+ double bc = 0;
+ for (int[] vec : confusionMatrix) {
+ bc += vec[i];
+ }
+ b += br * bc;
+ }
+ return (samples * a - b) / (samples * samples - b);
+ }
+
+ /**
+ * Standard deviation of normalized producer accuracy
+ * Not a standard score
+ * @return double
+ */
+ public RunningAverageAndStdDev getNormalizedStats() {
+ RunningAverageAndStdDev summer = new FullRunningAverageAndStdDev();
+ for (int d = 0; d < confusionMatrix.length; d++) {
+ double total = 0;
+ for (int j = 0; j < confusionMatrix.length; j++) {
+ total += confusionMatrix[d][j];
+ }
+ summer.addDatum(confusionMatrix[d][d] / (total + 0.000001));
+ }
+
+ return summer;
+ }
+
+ public int getCorrect(String label) {
+ int labelId = labelMap.get(label);
+ return confusionMatrix[labelId][labelId];
+ }
+
+ public int getTotal(String label) {
+ int labelId = labelMap.get(label);
+ int labelTotal = 0;
+ for (int i = 0; i < labelMap.size(); i++) {
+ labelTotal += confusionMatrix[labelId][i];
+ }
+ return labelTotal;
+ }
+
+ public void addInstance(String correctLabel, ClassifierResult classifiedResult) {
+ samples++;
+ incrementCount(correctLabel, classifiedResult.getLabel());
+ }
+
+ public void addInstance(String correctLabel, String classifiedLabel) {
+ samples++;
+ incrementCount(correctLabel, classifiedLabel);
+ }
+
+ public int getCount(String correctLabel, String classifiedLabel) {
+ if(!labelMap.containsKey(correctLabel)) {
+ LOG.warn("Label {} did not appear in the training examples", correctLabel);
+ return 0;
+ }
+ Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel);
+ int correctId = labelMap.get(correctLabel);
+ int classifiedId = labelMap.get(classifiedLabel);
+ return confusionMatrix[correctId][classifiedId];
+ }
+
+ public void putCount(String correctLabel, String classifiedLabel, int count) {
+ if(!labelMap.containsKey(correctLabel)) {
+ LOG.warn("Label {} did not appear in the training examples", correctLabel);
+ return;
+ }
+ Preconditions.checkArgument(labelMap.containsKey(classifiedLabel), "Label not found: " + classifiedLabel);
+ int correctId = labelMap.get(correctLabel);
+ int classifiedId = labelMap.get(classifiedLabel);
+ if (confusionMatrix[correctId][classifiedId] == 0.0 && count != 0) {
+ samples++;
+ }
+ confusionMatrix[correctId][classifiedId] = count;
+ }
+
+ public String getDefaultLabel() {
+ return defaultLabel;
+ }
+
+ public void incrementCount(String correctLabel, String classifiedLabel, int count) {
+ putCount(correctLabel, classifiedLabel, count + getCount(correctLabel, classifiedLabel));
+ }
+
+ public void incrementCount(String correctLabel, String classifiedLabel) {
+ incrementCount(correctLabel, classifiedLabel, 1);
+ }
+
+ public ConfusionMatrix merge(ConfusionMatrix b) {
+ Preconditions.checkArgument(labelMap.size() == b.getLabels().size(), "The label sizes do not match");
+ for (String correctLabel : this.labelMap.keySet()) {
+ for (String classifiedLabel : this.labelMap.keySet()) {
+ incrementCount(correctLabel, classifiedLabel, b.getCount(correctLabel, classifiedLabel));
+ }
+ }
+ return this;
+ }
+
+ public Matrix getMatrix() {
+ int length = confusionMatrix.length;
+ Matrix m = new DenseMatrix(length, length);
+ for (int r = 0; r < length; r++) {
+ for (int c = 0; c < length; c++) {
+ m.set(r, c, confusionMatrix[r][c]);
+ }
+ }
+ Map<String,Integer> labels = new HashMap<>();
+ for (Map.Entry<String, Integer> entry : labelMap.entrySet()) {
+ labels.put(entry.getKey(), entry.getValue());
+ }
+ m.setRowLabelBindings(labels);
+ m.setColumnLabelBindings(labels);
+ return m;
+ }
+
+ public void setMatrix(Matrix m) {
+ int length = confusionMatrix.length;
+ if (m.numRows() != m.numCols()) {
+ throw new IllegalArgumentException(
+ "ConfusionMatrix: matrix(" + m.numRows() + ',' + m.numCols() + ") must be square");
+ }
+ for (int r = 0; r < length; r++) {
+ for (int c = 0; c < length; c++) {
+ confusionMatrix[r][c] = (int) Math.round(m.get(r, c));
+ }
+ }
+ Map<String,Integer> labels = m.getRowLabelBindings();
+ if (labels == null) {
+ labels = m.getColumnLabelBindings();
+ }
+ if (labels != null) {
+ String[] sorted = sortLabels(labels);
+ verifyLabels(length, sorted);
+ labelMap.clear();
+ for (int i = 0; i < length; i++) {
+ labelMap.put(sorted[i], i);
+ }
+ }
+ }
+
+ private static String[] sortLabels(Map<String,Integer> labels) {
+ String[] sorted = new String[labels.size()];
+ for (Map.Entry<String,Integer> entry : labels.entrySet()) {
+ sorted[entry.getValue()] = entry.getKey();
+ }
+ return sorted;
+ }
+
+ private static void verifyLabels(int length, String[] sorted) {
+ Preconditions.checkArgument(sorted.length == length, "One label, one row");
+ for (int i = 0; i < length; i++) {
+ if (sorted[i] == null) {
+ Preconditions.checkArgument(false, "One label, one row");
+ }
+ }
+ }
+
+ /**
+ * This is overloaded. toString() is not a formatted report you print for a manager :)
+ * Assume that if there are no default assignments, the default feature was not used
+ */
+ @Override
+ public String toString() {
+ StringBuilder returnString = new StringBuilder(200);
+ returnString.append("=======================================================").append('\n');
+ returnString.append("Confusion Matrix\n");
+ returnString.append("-------------------------------------------------------").append('\n');
+
+ int unclassified = getTotal(defaultLabel);
+ for (Map.Entry<String,Integer> entry : this.labelMap.entrySet()) {
+ if (entry.getKey().equals(defaultLabel) && unclassified == 0) {
+ continue;
+ }
+
+ returnString.append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5)).append('\t');
+ }
+
+ returnString.append("<--Classified as").append('\n');
+ for (Map.Entry<String,Integer> entry : this.labelMap.entrySet()) {
+ if (entry.getKey().equals(defaultLabel) && unclassified == 0) {
+ continue;
+ }
+ String correctLabel = entry.getKey();
+ int labelTotal = 0;
+ for (String classifiedLabel : this.labelMap.keySet()) {
+ if (classifiedLabel.equals(defaultLabel) && unclassified == 0) {
+ continue;
+ }
+ returnString.append(
+ StringUtils.rightPad(Integer.toString(getCount(correctLabel, classifiedLabel)), 5)).append('\t');
+ labelTotal += getCount(correctLabel, classifiedLabel);
+ }
+ returnString.append(" | ").append(StringUtils.rightPad(String.valueOf(labelTotal), 6)).append('\t')
+ .append(StringUtils.rightPad(getSmallLabel(entry.getValue()), 5))
+ .append(" = ").append(correctLabel).append('\n');
+ }
+ if (unclassified > 0) {
+ returnString.append("Default Category: ").append(defaultLabel).append(": ").append(unclassified).append('\n');
+ }
+ returnString.append('\n');
+ return returnString.toString();
+ }
+
+ static String getSmallLabel(int i) {
+ int val = i;
+ StringBuilder returnString = new StringBuilder();
+ do {
+ int n = val % 26;
+ returnString.insert(0, (char) ('a' + n));
+ val /= 26;
+ } while (val > 0);
+ return returnString.toString();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
new file mode 100644
index 0000000..af1d5e7
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.Vector;
+
+import java.io.Closeable;
+
+/**
+ * The simplest interface for online learning algorithms.
+ */
+public interface OnlineLearner extends Closeable {
+ /**
+ * Updates the model using a particular target variable value and a feature vector.
+ * <p/>
+ * There may be an assumption that if multiple passes through the training data are necessary, then
+ * the training examples will be presented in the same order. This is because the order of
+ * training examples may be used to assign records to different data splits for evaluation by
+ * cross-validation. Without the order invariance, records might be assigned to training and test
+ * splits and error estimates could be seriously affected.
+ * <p/>
+ * If re-ordering is necessary, then using the alternative API which allows a tracking key to be
+ * added to the training example can be used.
+ *
+ * @param actual The value of the target variable. This value should be in the half-open
+ * interval [0..n) where n is the number of target categories.
+ * @param instance The feature vector for this example.
+ */
+ void train(int actual, Vector instance);
+
+ /**
+ * Updates the model using a particular target variable value and a feature vector.
+ * <p/>
+ * There may be an assumption that if multiple passes through the training data are necessary that
+ * the tracking key for a record will be the same for each pass and that there will be a
+ * relatively large number of distinct tracking keys and that the low-order bits of the tracking
+ * keys will not correlate with any of the input variables. This tracking key is used to assign
+ * training examples to different test/training splits.
+ * <p/>
+ * Examples of useful tracking keys include id-numbers for the training records derived from
+ * a database id for the base table from which the record is derived, or the offset of
+ * the original data record in a data file.
+ *
+ * @param trackingKey The tracking key for this training example.
+ * @param groupKey An optional value that allows examples to be grouped in the computation of
+ * the update to the model.
+ * @param actual The value of the target variable. This value should be in the half-open
+ * interval [0..n) where n is the number of target categories.
+ * @param instance The feature vector for this example.
+ */
+ void train(long trackingKey, String groupKey, int actual, Vector instance);
+
+ /**
+ * Updates the model using a particular target variable value and a feature vector.
+ * <p/>
+ * There may be an assumption that if multiple passes through the training data are necessary that
+ * the tracking key for a record will be the same for each pass and that there will be a
+ * relatively large number of distinct tracking keys and that the low-order bits of the tracking
+ * keys will not correlate with any of the input variables. This tracking key is used to assign
+ * training examples to different test/training splits.
+ * <p/>
+ * Examples of useful tracking keys include id-numbers for the training records derived from
+ * a database id for the base table from which the record is derived, or the offset of
+ * the original data record in a data file.
+ *
+ * @param trackingKey The tracking key for this training example.
+ * @param actual The value of the target variable. This value should be in the half-open
+ * interval [0..n) where n is the number of target categories.
+ * @param instance The feature vector for this example.
+ */
+ void train(long trackingKey, int actual, Vector instance);
+
+ /**
+ * Prepares the classifier for classification and deallocates any temporary data structures.
+ *
+ * An online classifier should be able to accept more training after being closed, but
+ * closing the classifier may make classification more efficient.
+ */
+ @Override
+ void close();
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
new file mode 100644
index 0000000..35c11ee
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/RegressionResultAnalyzer.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.lang3.StringUtils;
+
/**
 * RegressionResultAnalyzer captures regression statistics and displays them in a tabular manner.
 */
public class RegressionResultAnalyzer {

  /** Immutable pair of (actual value, predicted value) for one regressed instance. */
  private static class Result {
    private final double actual;
    private final double result;
    Result(double actual, double result) {
      this.actual = actual;
      this.result = result;
    }
    double getActual() {
      return actual;
    }
    double getResult() {
      return result;
    }
  }

  // Lazily initialized on the first call to addInstance().
  private List<Result> results;

  /**
   * Records a single regression result.
   *
   * @param actual
   *          The actual answer
   * @param result
   *          The regression result; NaN marks an unpredictable instance
   */
  public void addInstance(double actual, double result) {
    if (results == null) {
      results = new ArrayList<>();
    }
    results.add(new Result(actual, result));
  }

  /**
   * Records a table of regression results.
   *
   * @param results
   *          The results table; each row is {actual, predicted}
   */
  public void setInstances(double[][] results) {
    for (double[] res : results) {
      addInstance(res[0], res[1]);
    }
  }

  /**
   * Renders a summary table: correlation coefficient, mean absolute error and root mean
   * squared error over the predictable instances, plus instance counts.
   */
  @Override
  public String toString() {
    double sumActual = 0.0;
    double sumActualSquared = 0.0;
    double sumResult = 0.0;
    double sumResultSquared = 0.0;
    double sumActualResult = 0.0;
    double sumAbsolute = 0.0;
    double sumAbsoluteSquared = 0.0;
    int predictable = 0;
    int unpredictable = 0;

    if (results == null) {
      // No instances recorded yet: report all-zero counts instead of throwing NPE.
      results = new ArrayList<>();
    }

    for (Result res : results) {
      double actual = res.getActual();
      double result = res.getResult();
      if (Double.isNaN(result)) {
        unpredictable++;
      } else {
        sumActual += actual;
        sumActualSquared += actual * actual;
        sumResult += result;
        sumResultSquared += result * result;
        sumActualResult += actual * result;
        double absolute = Math.abs(actual - result);
        sumAbsolute += absolute;
        sumAbsoluteSquared += absolute * absolute;
        predictable++;
      }
    }

    StringBuilder returnString = new StringBuilder();

    returnString.append("=======================================================\n");
    returnString.append("Summary\n");
    returnString.append("-------------------------------------------------------\n");

    if (predictable > 0) {
      double varActual = sumActualSquared - sumActual * sumActual / predictable;
      double varResult = sumResultSquared - sumResult * sumResult / predictable;
      double varCo = sumActualResult - sumActual * sumResult / predictable;

      double correlation;
      if (varActual * varResult <= 0) {
        correlation = 0.0;
      } else {
        correlation = varCo / Math.sqrt(varActual * varResult);
      }

      // Use an explicitly US-localized formatter instead of mutating the JVM-wide
      // default locale (the old Locale.setDefault call), which is a global side effect.
      DecimalFormat decimalFormatter = (DecimalFormat) NumberFormat.getNumberInstance(Locale.US);
      decimalFormatter.applyPattern("0.####");

      returnString.append(String.format("%-40s: %10s\n", "Correlation coefficient",
          decimalFormatter.format(correlation)));
      returnString.append(String.format("%-40s: %10s\n", "Mean absolute error",
          decimalFormatter.format(sumAbsolute / predictable)));
      returnString.append(String.format("%-40s: %10s\n", "Root mean squared error",
          decimalFormatter.format(Math.sqrt(sumAbsoluteSquared / predictable))));
    }
    returnString.append(String.format("%-40s: %10s\n", "Predictable Instances",
        Integer.toString(predictable)));
    returnString.append(String.format("%-40s: %10s\n", "Unpredictable Instances",
        Integer.toString(unpredictable)));
    returnString.append(String.format("%-40s: %10s\n", "Total Regressed Instances",
        Integer.toString(results.size())));
    returnString.append('\n');

    return returnString.toString();
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
new file mode 100644
index 0000000..1711f19
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/ResultAnalyzer.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Collection;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+/** ResultAnalyzer captures the classification statistics and displays in a tabular manner */
+public class ResultAnalyzer {
+
+ private final ConfusionMatrix confusionMatrix;
+ private final OnlineSummarizer summarizer;
+ private boolean hasLL;
+
+ /*
+ * === Summary ===
+ *
+ * Correctly Classified Instances 635 92.9722 % Incorrectly Classified Instances 48 7.0278 % Kappa statistic
+ * 0.923 Mean absolute error 0.0096 Root mean squared error 0.0817 Relative absolute error 9.9344 % Root
+ * relative squared error 37.2742 % Total Number of Instances 683
+ */
+ private int correctlyClassified;
+ private int incorrectlyClassified;
+
+ public ResultAnalyzer(Collection<String> labelSet, String defaultLabel) {
+ confusionMatrix = new ConfusionMatrix(labelSet, defaultLabel);
+ summarizer = new OnlineSummarizer();
+ }
+
+ public ConfusionMatrix getConfusionMatrix() {
+ return this.confusionMatrix;
+ }
+
+ /**
+ *
+ * @param correctLabel
+ * The correct label
+ * @param classifiedResult
+ * The classified result
+ * @return whether the instance was correct or not
+ */
+ public boolean addInstance(String correctLabel, ClassifierResult classifiedResult) {
+ boolean result = correctLabel.equals(classifiedResult.getLabel());
+ if (result) {
+ correctlyClassified++;
+ } else {
+ incorrectlyClassified++;
+ }
+ confusionMatrix.addInstance(correctLabel, classifiedResult);
+ if (classifiedResult.getLogLikelihood() != Double.MAX_VALUE) {
+ summarizer.add(classifiedResult.getLogLikelihood());
+ hasLL = true;
+ }
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder returnString = new StringBuilder();
+
+ returnString.append('\n');
+ returnString.append("=======================================================\n");
+ returnString.append("Summary\n");
+ returnString.append("-------------------------------------------------------\n");
+ int totalClassified = correctlyClassified + incorrectlyClassified;
+ double percentageCorrect = (double) 100 * correctlyClassified / totalClassified;
+ double percentageIncorrect = (double) 100 * incorrectlyClassified / totalClassified;
+ NumberFormat decimalFormatter = new DecimalFormat("0.####");
+
+ returnString.append(StringUtils.rightPad("Correctly Classified Instances", 40)).append(": ").append(
+ StringUtils.leftPad(Integer.toString(correctlyClassified), 10)).append('\t').append(
+ StringUtils.leftPad(decimalFormatter.format(percentageCorrect), 10)).append("%\n");
+ returnString.append(StringUtils.rightPad("Incorrectly Classified Instances", 40)).append(": ").append(
+ StringUtils.leftPad(Integer.toString(incorrectlyClassified), 10)).append('\t').append(
+ StringUtils.leftPad(decimalFormatter.format(percentageIncorrect), 10)).append("%\n");
+ returnString.append(StringUtils.rightPad("Total Classified Instances", 40)).append(": ").append(
+ StringUtils.leftPad(Integer.toString(totalClassified), 10)).append('\n');
+ returnString.append('\n');
+
+ returnString.append(confusionMatrix);
+ returnString.append("=======================================================\n");
+ returnString.append("Statistics\n");
+ returnString.append("-------------------------------------------------------\n");
+
+ RunningAverageAndStdDev normStats = confusionMatrix.getNormalizedStats();
+ returnString.append(StringUtils.rightPad("Kappa", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getKappa()), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("Accuracy", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getAccuracy()), 10)).append("%\n");
+ returnString.append(StringUtils.rightPad("Reliability", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(normStats.getAverage() * 100.00000001), 10)).append("%\n");
+ returnString.append(StringUtils.rightPad("Reliability (standard deviation)", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(normStats.getStandardDeviation()), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("Weighted precision", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedPrecision()), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("Weighted recall", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedRecall()), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("Weighted F1 score", 40)).append(
+ StringUtils.leftPad(decimalFormatter.format(confusionMatrix.getWeightedF1score()), 10)).append('\n');
+
+ if (hasLL) {
+ returnString.append(StringUtils.rightPad("Log-likelihood", 30)).append("mean : ").append(
+ StringUtils.leftPad(decimalFormatter.format(summarizer.getMean()), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("", 30)).append(StringUtils.rightPad("25%-ile : ", 10)).append(
+ StringUtils.leftPad(decimalFormatter.format(summarizer.getQuartile(1)), 10)).append('\n');
+ returnString.append(StringUtils.rightPad("", 30)).append(StringUtils.rightPad("75%-ile : ", 10)).append(
+ StringUtils.leftPad(decimalFormatter.format(summarizer.getQuartile(3)), 10)).append('\n');
+ }
+
+ return returnString.toString();
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
new file mode 100644
index 0000000..f79a429
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/Bagging.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import org.apache.mahout.classifier.df.builder.TreeBuilder;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Arrays;
+import java.util.Random;
+
+/**
+ * Builds a tree using bagging
+ */
+@Deprecated
+public class Bagging {
+
+ private static final Logger log = LoggerFactory.getLogger(Bagging.class);
+
+ private final TreeBuilder treeBuilder;
+
+ private final Data data;
+
+ private final boolean[] sampled;
+
+ public Bagging(TreeBuilder treeBuilder, Data data) {
+ this.treeBuilder = treeBuilder;
+ this.data = data;
+ sampled = new boolean[data.size()];
+ }
+
+ /**
+ * Builds one tree
+ */
+ public Node build(Random rng) {
+ log.debug("Bagging...");
+ Arrays.fill(sampled, false);
+ Data bag = data.bagging(rng, sampled);
+
+ log.debug("Building...");
+ return treeBuilder.build(rng, bag);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
new file mode 100644
index 0000000..c94292c
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DFUtils.java
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.node.Node;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+
+/**
+ * Utility class that contains various helper methods
+ */
+@Deprecated
+public final class DFUtils {
+
+ private DFUtils() {
+ }
+
+ /**
+ * Writes an Node[] into a DataOutput
+ * @throws java.io.IOException
+ */
+ public static void writeArray(DataOutput out, Node[] array) throws IOException {
+ out.writeInt(array.length);
+ for (Node w : array) {
+ w.write(out);
+ }
+ }
+
+ /**
+ * Reads a Node[] from a DataInput
+ * @throws java.io.IOException
+ */
+ public static Node[] readNodeArray(DataInput in) throws IOException {
+ int length = in.readInt();
+ Node[] nodes = new Node[length];
+ for (int index = 0; index < length; index++) {
+ nodes[index] = Node.read(in);
+ }
+
+ return nodes;
+ }
+
+ /**
+ * Writes a double[] into a DataOutput
+ * @throws java.io.IOException
+ */
+ public static void writeArray(DataOutput out, double[] array) throws IOException {
+ out.writeInt(array.length);
+ for (double value : array) {
+ out.writeDouble(value);
+ }
+ }
+
+ /**
+ * Reads a double[] from a DataInput
+ * @throws java.io.IOException
+ */
+ public static double[] readDoubleArray(DataInput in) throws IOException {
+ int length = in.readInt();
+ double[] array = new double[length];
+ for (int index = 0; index < length; index++) {
+ array[index] = in.readDouble();
+ }
+
+ return array;
+ }
+
+ /**
+ * Writes an int[] into a DataOutput
+ * @throws java.io.IOException
+ */
+ public static void writeArray(DataOutput out, int[] array) throws IOException {
+ out.writeInt(array.length);
+ for (int value : array) {
+ out.writeInt(value);
+ }
+ }
+
+ /**
+ * Reads an int[] from a DataInput
+ * @throws java.io.IOException
+ */
+ public static int[] readIntArray(DataInput in) throws IOException {
+ int length = in.readInt();
+ int[] array = new int[length];
+ for (int index = 0; index < length; index++) {
+ array[index] = in.readInt();
+ }
+
+ return array;
+ }
+
+ /**
+ * Return a list of all files in the output directory
+ * @throws IOException if no file is found
+ */
+ public static Path[] listOutputFiles(FileSystem fs, Path outputPath) throws IOException {
+ List<Path> outputFiles = new ArrayList<>();
+ for (FileStatus s : fs.listStatus(outputPath, PathFilters.logsCRCFilter())) {
+ if (!s.isDir() && !s.getPath().getName().startsWith("_")) {
+ outputFiles.add(s.getPath());
+ }
+ }
+ if (outputFiles.isEmpty()) {
+ throw new IOException("No output found !");
+ }
+ return outputFiles.toArray(new Path[outputFiles.size()]);
+ }
+
+ /**
+ * Formats a time interval in milliseconds to a String in the form "hours:minutes:seconds:millis"
+ */
+ public static String elapsedTime(long milli) {
+ long seconds = milli / 1000;
+ milli %= 1000;
+
+ long minutes = seconds / 60;
+ seconds %= 60;
+
+ long hours = minutes / 60;
+ minutes %= 60;
+
+ return hours + "h " + minutes + "m " + seconds + "s " + milli;
+ }
+
+ public static void storeWritable(Configuration conf, Path path, Writable writable) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+
+ try (FSDataOutputStream out = fs.create(path)) {
+ writable.write(out);
+ }
+ }
+
+ /**
+ * Write a string to a path.
+ * @param conf From which the file system will be picked
+ * @param path Where the string will be written
+ * @param string The string to write
+ * @throws IOException if things go poorly
+ */
+ public static void storeString(Configuration conf, Path path, String string) throws IOException {
+ try (DataOutputStream out = path.getFileSystem(conf).create(path)) {
+ out.write(string.getBytes(Charset.defaultCharset()));
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
new file mode 100644
index 0000000..c11cf34
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/DecisionForest.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.classifier.df.data.Data;
+import org.apache.mahout.classifier.df.data.DataUtils;
+import org.apache.mahout.classifier.df.data.Dataset;
+import org.apache.mahout.classifier.df.data.Instance;
+import org.apache.mahout.classifier.df.node.Node;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Represents a forest of decision trees.
+ */
+@Deprecated
+public class DecisionForest implements Writable {
+
+ private final List<Node> trees;
+
+ private DecisionForest() {
+ trees = new ArrayList<>();
+ }
+
+ public DecisionForest(List<Node> trees) {
+ Preconditions.checkArgument(trees != null && !trees.isEmpty(), "trees argument must not be null or empty");
+
+ this.trees = trees;
+ }
+
+ List<Node> getTrees() {
+ return trees;
+ }
+
+ /**
+ * Classifies the data and calls callback for each classification
+ */
+ public void classify(Data data, double[][] predictions) {
+ Preconditions.checkArgument(data.size() == predictions.length, "predictions.length must be equal to data.size()");
+
+ if (data.isEmpty()) {
+ return; // nothing to classify
+ }
+
+ int treeId = 0;
+ for (Node tree : trees) {
+ for (int index = 0; index < data.size(); index++) {
+ if (predictions[index] == null) {
+ predictions[index] = new double[trees.size()];
+ }
+ predictions[index][treeId] = tree.classify(data.get(index));
+ }
+ treeId++;
+ }
+ }
+
+ /**
+ * predicts the label for the instance
+ *
+ * @param rng
+ * Random number generator, used to break ties randomly
+ * @return NaN if the label cannot be predicted
+ */
+ public double classify(Dataset dataset, Random rng, Instance instance) {
+ if (dataset.isNumerical(dataset.getLabelId())) {
+ double sum = 0;
+ int cnt = 0;
+ for (Node tree : trees) {
+ double prediction = tree.classify(instance);
+ if (!Double.isNaN(prediction)) {
+ sum += prediction;
+ cnt++;
+ }
+ }
+
+ if (cnt > 0) {
+ return sum / cnt;
+ } else {
+ return Double.NaN;
+ }
+ } else {
+ int[] predictions = new int[dataset.nblabels()];
+ for (Node tree : trees) {
+ double prediction = tree.classify(instance);
+ if (!Double.isNaN(prediction)) {
+ predictions[(int) prediction]++;
+ }
+ }
+
+ if (DataUtils.sum(predictions) == 0) {
+ return Double.NaN; // no prediction available
+ }
+
+ return DataUtils.maxindex(rng, predictions);
+ }
+ }
+
+ /**
+ * @return Mean number of nodes per tree
+ */
+ public long meanNbNodes() {
+ long sum = 0;
+
+ for (Node tree : trees) {
+ sum += tree.nbNodes();
+ }
+
+ return sum / trees.size();
+ }
+
+ /**
+ * @return Total number of nodes in all the trees
+ */
+ public long nbNodes() {
+ long sum = 0;
+
+ for (Node tree : trees) {
+ sum += tree.nbNodes();
+ }
+
+ return sum;
+ }
+
+ /**
+ * @return Mean maximum depth per tree
+ */
+ public long meanMaxDepth() {
+ long sum = 0;
+
+ for (Node tree : trees) {
+ sum += tree.maxDepth();
+ }
+
+ return sum / trees.size();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof DecisionForest)) {
+ return false;
+ }
+
+ DecisionForest rf = (DecisionForest) obj;
+
+ return trees.size() == rf.getTrees().size() && trees.containsAll(rf.getTrees());
+ }
+
+ @Override
+ public int hashCode() {
+ return trees.hashCode();
+ }
+
+ @Override
+ public void write(DataOutput dataOutput) throws IOException {
+ dataOutput.writeInt(trees.size());
+ for (Node tree : trees) {
+ tree.write(dataOutput);
+ }
+ }
+
+ /**
+ * Reads the trees from the input and adds them to the existing trees
+ */
+ @Override
+ public void readFields(DataInput dataInput) throws IOException {
+ int size = dataInput.readInt();
+ for (int i = 0; i < size; i++) {
+ trees.add(Node.read(dataInput));
+ }
+ }
+
+ /**
+ * Read the forest from inputStream
+ * @param dataInput - input forest
+ * @return {@link org.apache.mahout.classifier.df.DecisionForest}
+ * @throws IOException
+ */
+ public static DecisionForest read(DataInput dataInput) throws IOException {
+ DecisionForest forest = new DecisionForest();
+ forest.readFields(dataInput);
+ return forest;
+ }
+
+ /**
+ * Load the forest from a single file or a directory of files
+ * @throws java.io.IOException
+ */
+ public static DecisionForest load(Configuration conf, Path forestPath) throws IOException {
+ FileSystem fs = forestPath.getFileSystem(conf);
+ Path[] files;
+ if (fs.getFileStatus(forestPath).isDir()) {
+ files = DFUtils.listOutputFiles(fs, forestPath);
+ } else {
+ files = new Path[]{forestPath};
+ }
+
+ DecisionForest forest = null;
+ for (Path path : files) {
+ try (FSDataInputStream dataInput = new FSDataInputStream(fs.open(path))) {
+ if (forest == null) {
+ forest = read(dataInput);
+ } else {
+ forest.readFields(dataInput);
+ }
+ }
+ }
+
+ return forest;
+
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
new file mode 100644
index 0000000..13cd386
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/classifier/df/ErrorEstimate.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.df;
+
+import com.google.common.base.Preconditions;
+
/**
 * Various methods to compute error statistics from the output of a random forest
 */
@Deprecated
public final class ErrorEstimate {

  private ErrorEstimate() {
  }

  /**
   * Computes the fraction of classified instances whose prediction differs from the
   * label. A prediction of -1 means "not classified" and is excluded from both the
   * numerator and the denominator.
   *
   * @param labels expected labels, one per instance
   * @param predictions predicted labels, one per instance; -1 marks an unclassified instance
   * @return error rate over the classified instances; NaN (0/0) when no instance was
   *         classified, matching the historical behavior
   * @throws IllegalArgumentException if the two arrays differ in length
   */
  public static double errorRate(double[] labels, double[] predictions) {
    // Plain stdlib check instead of Guava's Preconditions (this was the only Guava use
    // in the class); same exception type and message as before.
    if (labels.length != predictions.length) {
      throw new IllegalArgumentException("labels.length != predictions.length");
    }
    double nberrors = 0; // number of instances that got bad predictions
    double datasize = 0; // number of classified instances

    for (int index = 0; index < labels.length; index++) {
      if (predictions[index] == -1) {
        continue; // instance not classified
      }

      if (predictions[index] != labels[index]) {
        nberrors++;
      }

      datasize++;
    }

    return nberrors / datasize;
  }

}
r***@apache.org
2018-06-28 14:55:03 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
new file mode 100644
index 0000000..a99d54c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
@@ -0,0 +1,265 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.common.RandomUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Collection;
+import java.util.Random;
+
+/**
+ * {@link Factorizer} based on Simon Funk's famous article <a href="http://sifter.org/~simon/journal/20061211.html">
+ * "Netflix Update: Try this at home"</a>.
+ *
+ * Attempts to be as memory efficient as possible, only iterating once through the
+ * {@link FactorizablePreferences} or {@link DataModel} while copying everything to primitive arrays.
+ * Learning works in place on these datastructures after that.
+ */
+public class ParallelArraysSGDFactorizer implements Factorizer {
+
+ public static final double DEFAULT_LEARNING_RATE = 0.005;
+ public static final double DEFAULT_PREVENT_OVERFITTING = 0.02;
+ public static final double DEFAULT_RANDOM_NOISE = 0.005;
+
+ // number of latent features per user/item, and training passes per feature
+ private final int numFeatures;
+ private final int numIterations;
+ // clamping bounds taken from the preference data
+ private final float minPreference;
+ private final float maxPreference;
+
+ private final Random random;
+ private final double learningRate;
+ private final double preventOverfitting;
+
+ // map external long IDs to dense array indexes
+ private final FastByIDMap<Integer> userIDMapping;
+ private final FastByIDMap<Integer> itemIDMapping;
+
+ // learned factor matrices, indexed [denseIndex][feature]
+ private final double[][] userFeatures;
+ private final double[][] itemFeatures;
+
+ // every preference flattened into parallel primitive arrays (hence the class name)
+ private final int[] userIndexes;
+ private final int[] itemIndexes;
+ private final float[] values;
+
+ // feature-vector initialization constants, derived from the average preference
+ private final double defaultValue;
+ private final double interval;
+ // per-preference estimate accumulated over the features already trained,
+ // so earlier features never need to be recomputed
+ private final double[] cachedEstimates;
+
+
+ private static final Logger log = LoggerFactory.getLogger(ParallelArraysSGDFactorizer.class);
+
+ // Convenience constructor: wrap a DataModel and use default hyper-parameters.
+ public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) {
+ this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, DEFAULT_LEARNING_RATE,
+ DEFAULT_PREVENT_OVERFITTING, DEFAULT_RANDOM_NOISE);
+ }
+
+ // Convenience constructor: wrap a DataModel with explicit hyper-parameters.
+ public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations, double learningRate,
+ double preventOverfitting, double randomNoise) {
+ this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, learningRate, preventOverfitting,
+ randomNoise);
+ }
+
+ // Convenience constructor: FactorizablePreferences with default hyper-parameters.
+ public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePrefs, int numFeatures, int numIterations) {
+ this(factorizablePrefs, numFeatures, numIterations, DEFAULT_LEARNING_RATE, DEFAULT_PREVENT_OVERFITTING,
+ DEFAULT_RANDOM_NOISE);
+ }
+
+ /**
+  * Main constructor: copies all preferences into primitive parallel arrays
+  * (one pass over the data) and randomly initializes the feature matrices.
+  */
+ public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePreferences, int numFeatures,
+ int numIterations, double learningRate, double preventOverfitting, double randomNoise) {
+
+ this.numFeatures = numFeatures;
+ this.numIterations = numIterations;
+ minPreference = factorizablePreferences.getMinPreference();
+ maxPreference = factorizablePreferences.getMaxPreference();
+
+ this.random = RandomUtils.getRandom();
+ this.learningRate = learningRate;
+ this.preventOverfitting = preventOverfitting;
+
+ int numUsers = factorizablePreferences.numUsers();
+ int numItems = factorizablePreferences.numItems();
+ int numPrefs = factorizablePreferences.numPreferences();
+
+ // assign each user ID a dense index in iteration order
+ log.info("Mapping {} users...", numUsers);
+ userIDMapping = new FastByIDMap<>(numUsers);
+ int index = 0;
+ LongPrimitiveIterator userIterator = factorizablePreferences.getUserIDs();
+ while (userIterator.hasNext()) {
+ userIDMapping.put(userIterator.nextLong(), index++);
+ }
+
+ // assign each item ID a dense index in iteration order
+ log.info("Mapping {} items", numItems);
+ itemIDMapping = new FastByIDMap<>(numItems);
+ index = 0;
+ LongPrimitiveIterator itemIterator = factorizablePreferences.getItemIDs();
+ while (itemIterator.hasNext()) {
+ itemIDMapping.put(itemIterator.nextLong(), index++);
+ }
+
+ this.userIndexes = new int[numPrefs];
+ this.itemIndexes = new int[numPrefs];
+ this.values = new float[numPrefs];
+ this.cachedEstimates = new double[numPrefs];
+
+ // single pass over all preferences: fill the parallel arrays and
+ // accumulate the running average of the preference values
+ index = 0;
+ log.info("Loading {} preferences into memory", numPrefs);
+ RunningAverage average = new FullRunningAverage();
+ for (Preference preference : factorizablePreferences.getPreferences()) {
+ userIndexes[index] = userIDMapping.get(preference.getUserID());
+ itemIndexes[index] = itemIDMapping.get(preference.getItemID());
+ values[index] = preference.getValue();
+ cachedEstimates[index] = 0;
+
+ average.addDatum(preference.getValue());
+
+ index++;
+ if (index % 1000000 == 0) {
+ log.info("Processed {} preferences", index);
+ }
+ }
+ log.info("Processed {} preferences, done.", index);
+
+ double averagePreference = average.getAverage();
+ log.info("Average preference value is {}", averagePreference);
+
+ // initialization heuristic: features start near sqrt(avg/numFeatures) so that
+ // their product roughly reproduces the average preference
+ // NOTE(review): the 0.1*interval adjustment appears to be an empirical tweak — no
+ // derivation visible here
+ double prefInterval = factorizablePreferences.getMaxPreference() - factorizablePreferences.getMinPreference();
+ defaultValue = Math.sqrt((averagePreference - prefInterval * 0.1) / numFeatures);
+ interval = prefInterval * 0.1 / numFeatures;
+
+ userFeatures = new double[numUsers][numFeatures];
+ itemFeatures = new double[numItems][numFeatures];
+
+ // seed every feature with defaultValue plus small uniform noise scaled by randomNoise
+ log.info("Initializing feature vectors...");
+ for (int feature = 0; feature < numFeatures; feature++) {
+ for (int userIndex = 0; userIndex < numUsers; userIndex++) {
+ userFeatures[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
+ }
+ for (int itemIndex = 0; itemIndex < numItems; itemIndex++) {
+ itemFeatures[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
+ }
+ }
+ }
+
+ /**
+  * Trains the features one at a time (Funk-style SGD): each feature gets
+  * numIterations passes over the shuffled preferences, then its contribution
+  * is folded into cachedEstimates before the next feature starts.
+  */
+ @Override
+ public Factorization factorize() throws TasteException {
+ for (int feature = 0; feature < numFeatures; feature++) {
+ log.info("Shuffling preferences...");
+ shufflePreferences();
+ log.info("Starting training of feature {} ...", feature);
+ for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
+ if (currentIteration == numIterations - 1) {
+ // last pass also measures RMSE for logging
+ double rmse = trainingIterationWithRmse(feature);
+ log.info("Finished training feature {} with RMSE {}", feature, rmse);
+ } else {
+ trainingIteration(feature);
+ }
+ }
+ if (feature < numFeatures - 1) {
+ // fold the finished feature into the cache (trailing=false: no
+ // optimistic term, no clamping) so later features build on it
+ log.info("Updating cache...");
+ for (int index = 0; index < userIndexes.length; index++) {
+ cachedEstimates[index] = estimate(userIndexes[index], itemIndexes[index], feature, cachedEstimates[index],
+ false);
+ }
+ }
+ }
+ log.info("Factorization done");
+ return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
+ }
+
+ // One SGD pass over all preferences for the given feature.
+ private void trainingIteration(int feature) {
+ for (int index = 0; index < userIndexes.length; index++) {
+ train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
+ }
+ }
+
+ // Same as trainingIteration, but also returns the RMSE of this pass.
+ private double trainingIterationWithRmse(int feature) {
+ double rmse = 0.0;
+ for (int index = 0; index < userIndexes.length; index++) {
+ double error = train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
+ rmse += error * error;
+ }
+ return Math.sqrt(rmse / userIndexes.length);
+ }
+
+ /**
+  * Estimates a preference as cachedEstimate (features already trained) plus the
+  * current feature's contribution. When trailing is true, an optimistic
+  * (defaultValue + interval)^2 term is added for each not-yet-trained feature
+  * and the result is clamped to [minPreference, maxPreference].
+  */
+ private double estimate(int userIndex, int itemIndex, int feature, double cachedEstimate, boolean trailing) {
+ double sum = cachedEstimate;
+ sum += userFeatures[userIndex][feature] * itemFeatures[itemIndex][feature];
+ if (trailing) {
+ sum += (numFeatures - feature - 1) * (defaultValue + interval) * (defaultValue + interval);
+ if (sum > maxPreference) {
+ sum = maxPreference;
+ } else if (sum < minPreference) {
+ sum = minPreference;
+ }
+ }
+ return sum;
+ }
+
+ /**
+  * One SGD update for a single (user, item) pair on the given feature.
+  * Returns the prediction error before the update.
+  */
+ public double train(int userIndex, int itemIndex, int feature, double original, double cachedEstimate) {
+ double error = original - estimate(userIndex, itemIndex, feature, cachedEstimate, true);
+ double[] userVector = userFeatures[userIndex];
+ double[] itemVector = itemFeatures[itemIndex];
+
+ // note: the item update on the next line reads userVector[feature] AFTER the
+ // user update — the statement order is significant
+ userVector[feature] += learningRate * (error * itemVector[feature] - preventOverfitting * userVector[feature]);
+ itemVector[feature] += learningRate * (error * userVector[feature] - preventOverfitting * itemVector[feature]);
+
+ return error;
+ }
+
+ // Shuffles all four parallel arrays in lockstep.
+ protected void shufflePreferences() {
+ /* Durstenfeld shuffle */
+ for (int currentPos = userIndexes.length - 1; currentPos > 0; currentPos--) {
+ int swapPos = random.nextInt(currentPos + 1);
+ swapPreferences(currentPos, swapPos);
+ }
+ }
+
+ // Swaps entry posA with entry posB in all four parallel arrays.
+ private void swapPreferences(int posA, int posB) {
+ int tmpUserIndex = userIndexes[posA];
+ int tmpItemIndex = itemIndexes[posA];
+ float tmpValue = values[posA];
+ double tmpEstimate = cachedEstimates[posA];
+
+ userIndexes[posA] = userIndexes[posB];
+ itemIndexes[posA] = itemIndexes[posB];
+ values[posA] = values[posB];
+ cachedEstimates[posA] = cachedEstimates[posB];
+
+ userIndexes[posB] = tmpUserIndex;
+ itemIndexes[posB] = tmpItemIndex;
+ values[posB] = tmpValue;
+ cachedEstimates[posB] = tmpEstimate;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // do nothing
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
new file mode 100644
index 0000000..5cce02d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.example.kddcup.track1.EstimateConverter;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
+import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * run an SVD factorization of the KDD track1 data.
+ *
+ * needs at least 6-7GB of memory, tested with -Xms6700M -Xmx6700M
+ *
+ */
+public final class Track1SVDRunner {
+
+ private static final Logger log = LoggerFactory.getLogger(Track1SVDRunner.class);
+
+ private Track1SVDRunner() {
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ if (args.length != 2) {
+ System.err.println("Necessary arguments: <kddDataFileDirectory> <resultFile>");
+ return;
+ }
+
+ File dataFileDirectory = new File(args[0]);
+ if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+ throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+ }
+
+ File resultFile = new File(args[1]);
+
+ /* the knobs to turn */
+ int numFeatures = 20;
+ int numIterations = 5;
+ double learningRate = 0.0001;
+ double preventOverfitting = 0.002;
+ double randomNoise = 0.0001;
+
+
+ // factorize the training portion of the KDD data with SGD
+ KDDCupFactorizablePreferences factorizablePreferences =
+ new KDDCupFactorizablePreferences(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+
+ Factorizer sgdFactorizer = new ParallelArraysSGDFactorizer(factorizablePreferences, numFeatures, numIterations,
+ learningRate, preventOverfitting, randomNoise);
+
+ Factorization factorization = sgdFactorizer.factorize();
+
+ // measure RMSE on the validation set: average the squared errors, sqrt at the end
+ log.info("Estimating validation preferences...");
+ int prefsProcessed = 0;
+ RunningAverage average = new FullRunningAverage();
+ for (Pair<PreferenceArray,long[]> validationPair
+ : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
+ for (Preference validationPref : validationPair.getFirst()) {
+ double estimate = estimatePreference(factorization, validationPref.getUserID(), validationPref.getItemID(),
+ factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
+ double error = validationPref.getValue() - estimate;
+ average.addDatum(error * error);
+ prefsProcessed++;
+ if (prefsProcessed % 100000 == 0) {
+ log.info("Computed {} estimations", prefsProcessed);
+ }
+ }
+ }
+ log.info("Computed {} estimations, done.", prefsProcessed);
+
+ double rmse = Math.sqrt(average.getAverage());
+ log.info("RMSE {}", rmse);
+
+ // estimate the test set and write one converted byte per preference, in
+ // the contest's expected submission format
+ log.info("Estimating test preferences...");
+ OutputStream out = null;
+ try {
+ out = new BufferedOutputStream(new FileOutputStream(resultFile));
+
+ for (Pair<PreferenceArray,long[]> testPair
+ : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+ for (Preference testPref : testPair.getFirst()) {
+ double estimate = estimatePreference(factorization, testPref.getUserID(), testPref.getItemID(),
+ factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
+ byte result = EstimateConverter.convert(estimate, testPref.getUserID(), testPref.getItemID());
+ out.write(result);
+ }
+ }
+ } finally {
+ // swallowIOException=false: a failure to close/flush the result file is reported
+ Closeables.close(out, false);
+ }
+ log.info("wrote estimates to {}, done.", resultFile.getAbsolutePath());
+ }
+
+ /**
+  * Estimates a preference as the dot product of the user and item feature
+  * vectors, clamped to [minPreference, maxPreference].
+  *
+  * @throws NoSuchUserException if the factorization has no features for userID
+  * @throws NoSuchItemException if the factorization has no features for itemID
+  */
+ static double estimatePreference(Factorization factorization, long userID, long itemID, float minPreference,
+ float maxPreference) throws NoSuchUserException, NoSuchItemException {
+ double[] userFeatures = factorization.getUserFeatures(userID);
+ double[] itemFeatures = factorization.getItemFeatures(itemID);
+ double estimate = 0;
+ for (int feature = 0; feature < userFeatures.length; feature++) {
+ estimate += userFeatures[feature] * itemFeatures[feature];
+ }
+ if (estimate < minPreference) {
+ estimate = minPreference;
+ } else if (estimate > maxPreference) {
+ estimate = maxPreference;
+ }
+ return estimate;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
new file mode 100644
index 0000000..ce025a9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.similarity.AbstractItemSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+/**
+ * Item similarity that combines a collaborative similarity (log-likelihood over
+ * the data model) with a content-based track similarity by multiplying the two.
+ */
+final class HybridSimilarity extends AbstractItemSimilarity {
+
+ private final ItemSimilarity cfSimilarity;
+ private final ItemSimilarity contentSimilarity;
+
+ HybridSimilarity(DataModel dataModel, File dataFileDirectory) throws IOException {
+ super(dataModel);
+ cfSimilarity = new LogLikelihoodSimilarity(dataModel);
+ contentSimilarity = new TrackItemSimilarity(dataFileDirectory);
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ // product of the content-based and collaborative similarities
+ return contentSimilarity.itemSimilarity(itemID1, itemID2) * cfSimilarity.itemSimilarity(itemID1, itemID2);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ // element-wise product of the two similarity arrays
+ double[] result = contentSimilarity.itemSimilarities(itemID1, itemID2s);
+ double[] multipliers = cfSimilarity.itemSimilarities(itemID1, itemID2s);
+ for (int i = 0; i < result.length; i++) {
+ result[i] *= multipliers[i];
+ }
+ return result;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // NOTE(review): only the collaborative part is refreshed here; contentSimilarity
+ // is built from static data files and presumably never changes — confirm
+ cfSimilarity.refresh(alreadyRefreshed);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
new file mode 100644
index 0000000..50fd35e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Estimates the six test items of one user and reports, for each, whether it
+ * landed in that user's top-three estimates.
+ */
+final class Track2Callable implements Callable<UserResult> {
+
+ private static final Logger log = LoggerFactory.getLogger(Track2Callable.class);
+ // shared across all callables to log overall progress
+ private static final AtomicInteger COUNT = new AtomicInteger();
+
+ private final Recommender recommender;
+ private final PreferenceArray userTest;
+
+ Track2Callable(Recommender recommender, PreferenceArray userTest) {
+ this.recommender = recommender;
+ this.userTest = userTest;
+ }
+
+ @Override
+ public UserResult call() throws TasteException {
+
+ int testSize = userTest.length();
+ if (testSize != 6) {
+ throw new IllegalArgumentException("Expecting 6 items for user but got " + userTest);
+ }
+ long userID = userTest.get(0).getUserID();
+ // sorted descending by estimate so the first entries are the best items
+ // NOTE(review): a TreeMap keyed on the estimate collapses ties — two items with
+ // the exact same estimate keep only the later one; presumably rare enough here
+ TreeMap<Double,Long> estimateToItemID = new TreeMap<>(Collections.reverseOrder());
+
+ for (int i = 0; i < testSize; i++) {
+ long itemID = userTest.getItemID(i);
+ double estimate;
+ try {
+ estimate = recommender.estimatePreference(userID, itemID);
+ } catch (NoSuchItemException nsie) {
+ // OK in the sample data provided before the contest, should never happen otherwise
+ log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
+ continue;
+ }
+
+ // NaN estimates are dropped; they are compensated for below
+ if (!Double.isNaN(estimate)) {
+ estimateToItemID.put(estimate, itemID);
+ }
+ }
+
+ Collection<Long> itemIDs = estimateToItemID.values();
+ List<Long> topThree = new ArrayList<>(itemIDs);
+ if (topThree.size() > 3) {
+ topThree = topThree.subList(0, 3);
+ } else if (topThree.size() < 3) {
+ log.warn("Unable to recommend three items for {}", userID);
+ // Some NaNs - just guess at the rest then
+ // pad with arbitrary test items (set semantics avoid duplicates) until we have 3
+ Collection<Long> newItemIDs = new HashSet<>(3);
+ newItemIDs.addAll(itemIDs);
+ int i = 0;
+ while (i < testSize && newItemIDs.size() < 3) {
+ newItemIDs.add(userTest.getItemID(i));
+ i++;
+ }
+ topThree = new ArrayList<>(newItemIDs);
+ }
+ if (topThree.size() != 3) {
+ throw new IllegalStateException();
+ }
+
+ // result[i] == true iff test item i is among the user's top three estimates
+ boolean[] result = new boolean[testSize];
+ for (int i = 0; i < testSize; i++) {
+ result[i] = topThree.contains(userTest.getItemID(i));
+ }
+
+ if (COUNT.incrementAndGet() % 1000 == 0) {
+ log.info("Completed {} users", COUNT.get());
+ }
+
+ return new UserResult(userID, result);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
new file mode 100644
index 0000000..185a00d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefItemBasedRecommender;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+public final class Track2Recommender implements Recommender {
+
+ private final Recommender recommender;
+
+ public Track2Recommender(DataModel dataModel, File dataFileDirectory) throws TasteException {
+ // Change this to whatever you like!
+ ItemSimilarity similarity;
+ try {
+ similarity = new HybridSimilarity(dataModel, dataFileDirectory);
+ } catch (IOException ioe) {
+ throw new TasteException(ioe);
+ }
+ recommender = new GenericBooleanPrefItemBasedRecommender(dataModel, similarity);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return recommender.recommend(userID, howMany);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+ return recommend(userID, howMany, null, includeKnownItems);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return recommender.estimatePreference(userID, itemID);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ recommender.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ recommender.removePreference(userID, itemID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return recommender.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ recommender.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "Track1Recommender[recommender:" + recommender + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
new file mode 100644
index 0000000..09ade5d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+final class Track2RecommenderBuilder implements RecommenderBuilder {
+
+ @Override
+ public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+ return new Track2Recommender(dataModel, ((KDDCupDataModel) dataModel).getDataFileDirectory());
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
new file mode 100644
index 0000000..3cbb61c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+/**
+ * <p>Runs "track 2" of the KDD Cup competition using whatever recommender is inside {@link Track2Recommender}
+ * and attempts to output the result in the correct contest format.</p>
+ *
+ * <p>Run as: {@code Track2Runner [track 2 data file directory] [output file]}</p>
+ */
+public final class Track2Runner {
+
+ private static final Logger log = LoggerFactory.getLogger(Track2Runner.class);
+
+ // Utility class: entry point only, never instantiated.
+ private Track2Runner() {
+ }
+
+ /**
+ * Entry point. args[0] = directory holding the track-2 data files,
+ * args[1] = output file to write one result line of bytes per test user.
+ */
+ public static void main(String[] args) throws Exception {
+
+ File dataFileDirectory = new File(args[0]);
+ if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+ throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+ }
+
+ long start = System.currentTimeMillis();
+
+ // Load the training data and build the recommender under test.
+ KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+ Track2Recommender recommender = new Track2Recommender(model, dataFileDirectory);
+
+ long end = System.currentTimeMillis();
+ log.info("Loaded model in {}s", (end - start) / 1000);
+ start = end;
+
+ // One Track2Callable per test user, in the order the test file yields them.
+ Collection<Track2Callable> callables = new ArrayList<>();
+ for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+ PreferenceArray userTest = tests.getFirst();
+ callables.add(new Track2Callable(recommender, userTest));
+ }
+
+ int cores = Runtime.getRuntime().availableProcessors();
+ log.info("Running on {} cores", cores);
+ ExecutorService executor = Executors.newFixedThreadPool(cores);
+ // invokeAll blocks until all tasks finish and returns futures in submission order,
+ // so the output below preserves the test file's user order.
+ List<Future<UserResult>> futures = executor.invokeAll(callables);
+ executor.shutdown();
+
+ end = System.currentTimeMillis();
+ log.info("Ran recommendations in {}s", (end - start) / 1000);
+ start = end;
+
+ try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
+ long lastUserID = Long.MIN_VALUE;
+ for (Future<UserResult> future : futures) {
+ UserResult result = future.get();
+ long userID = result.getUserID();
+ // Sanity check: user IDs must be strictly increasing (presumably the
+ // contest's expected ordering — the test file appears sorted by user ID).
+ if (userID <= lastUserID) {
+ throw new IllegalStateException();
+ }
+ lastUserID = userID;
+ out.write(result.getResultBytes());
+ }
+ }
+
+ end = System.currentTimeMillis();
+ log.info("Wrote output in {}s", (end - start) / 1000);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
new file mode 100644
index 0000000..abd15f8
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.util.regex.Pattern;
+
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+
+/**
+ * Parsed record for one line of the KDD Cup track metadata file, whose format is
+ * {@code trackID|albumID|artistID[|genreID...]} with the literal string "None"
+ * standing in for a missing album or artist ID.
+ */
+final class TrackData {
+
+ private static final Pattern PIPE = Pattern.compile("\\|");
+ private static final String NO_VALUE = "None";
+ // Sentinel for "no album/artist"; Long.MIN_VALUE cannot collide with a real ID.
+ static final long NO_VALUE_ID = Long.MIN_VALUE;
+ // Shared empty set for tracks without genres, to avoid one allocation per track.
+ private static final FastIDSet NO_GENRES = new FastIDSet();
+
+ private final long trackID;
+ private final long albumID;
+ private final long artistID;
+ private final FastIDSet genreIDs;
+
+ TrackData(CharSequence line) {
+ String[] tokens = PIPE.split(line);
+ trackID = Long.parseLong(tokens[0]);
+ albumID = parse(tokens[1]);
+ artistID = parse(tokens[2]);
+ // Tokens beyond the third, when present, are genre IDs.
+ if (tokens.length > 3) {
+ genreIDs = new FastIDSet(tokens.length - 3);
+ for (int i = 3; i < tokens.length; i++) {
+ genreIDs.add(Long.parseLong(tokens[i]));
+ }
+ } else {
+ genreIDs = NO_GENRES;
+ }
+ }
+
+ // Maps "None" to the sentinel, otherwise parses the numeric ID.
+ private static long parse(String value) {
+ return NO_VALUE.equals(value) ? NO_VALUE_ID : Long.parseLong(value);
+ }
+
+ public long getTrackID() {
+ return trackID;
+ }
+
+ public long getAlbumID() {
+ return albumID;
+ }
+
+ public long getArtistID() {
+ return artistID;
+ }
+
+ public FastIDSet getGenreIDs() {
+ return genreIDs;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
new file mode 100644
index 0000000..3012a84
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+/**
+ * Heuristic {@link ItemSimilarity} over KDD Cup track metadata: same album scores
+ * 0.9, same artist 0.7, otherwise a genre-overlap (Tanimoto) score capped at 0.25.
+ */
+final class TrackItemSimilarity implements ItemSimilarity {
+
+ // Track metadata keyed by track ID, loaded once at construction.
+ private final FastByIDMap<TrackData> trackData;
+
+ TrackItemSimilarity(File dataFileDirectory) throws IOException {
+ trackData = new FastByIDMap<>();
+ for (String line : new FileLineIterable(KDDCupDataModel.getTrackFile(dataFileDirectory))) {
+ TrackData trackDatum = new TrackData(line);
+ trackData.put(trackDatum.getTrackID(), trackDatum);
+ }
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) {
+ if (itemID1 == itemID2) {
+ return 1.0;
+ }
+ TrackData data1 = trackData.get(itemID1);
+ TrackData data2 = trackData.get(itemID2);
+ // Unknown tracks are treated as completely dissimilar.
+ if (data1 == null || data2 == null) {
+ return 0.0;
+ }
+
+ // Arbitrarily decide that same album means "very similar"
+ if (data1.getAlbumID() != TrackData.NO_VALUE_ID && data1.getAlbumID() == data2.getAlbumID()) {
+ return 0.9;
+ }
+ // ... and same artist means "fairly similar"
+ if (data1.getArtistID() != TrackData.NO_VALUE_ID && data1.getArtistID() == data2.getArtistID()) {
+ return 0.7;
+ }
+
+ // Tanimoto coefficient similarity based on genre, but maximum value of 0.25
+ FastIDSet genres1 = data1.getGenreIDs();
+ FastIDSet genres2 = data2.getGenreIDs();
+ if (genres1 == null || genres2 == null) {
+ return 0.0;
+ }
+ int intersectionSize = genres1.intersectionSize(genres2);
+ if (intersectionSize == 0) {
+ return 0.0;
+ }
+ int unionSize = genres1.size() + genres2.size() - intersectionSize;
+ // Tanimoto = intersection/union, scaled by 1/4 to cap the score at 0.25.
+ return intersectionSize / (4.0 * unionSize);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) {
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = itemSimilarity(itemID1, itemID2s[i]);
+ }
+ return result;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) {
+ // NOTE(review): itemSimilarity above never returns NaN, so this filter is
+ // vacuous and every known track ID is returned — confirm that is intended.
+ FastIDSet allSimilarItemIDs = new FastIDSet();
+ LongPrimitiveIterator allItemIDs = trackData.keySetIterator();
+ while (allItemIDs.hasNext()) {
+ long possiblySimilarItemID = allItemIDs.nextLong();
+ if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
+ allSimilarItemIDs.add(possiblySimilarItemID);
+ }
+ }
+ return allSimilarItemIDs.toArray();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // do nothing
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
new file mode 100644
index 0000000..e554d10
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track2;
+
+/**
+ * One user's track-2 answer: the user ID plus the boolean answers encoded as
+ * ASCII '0'/'1' bytes ready to be written to the contest output file.
+ */
+final class UserResult {
+
+ private final long userID;
+ private final byte[] resultBytes;
+
+ UserResult(long userID, boolean[] result) {
+
+ this.userID = userID;
+
+ int trueCount = 0;
+ for (boolean b : result) {
+ if (b) {
+ trueCount++;
+ }
+ }
+ // Exactly three entries must be true — presumably the track-2 answer format
+ // (pick 3 of the candidate tracks); confirm against the contest spec.
+ if (trueCount != 3) {
+ throw new IllegalStateException();
+ }
+
+ // Encode each boolean as the ASCII character '1' or '0'.
+ resultBytes = new byte[result.length];
+ for (int i = 0; i < result.length; i++) {
+ resultBytes[i] = (byte) (result[i] ? '1' : '0');
+ }
+ }
+
+ public long getUserID() {
+ return userID;
+ }
+
+ public byte[] getResultBytes() {
+ return resultBytes;
+ }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
new file mode 100644
index 0000000..22f122e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.example.als.netflix;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/** converts the raw files provided by netflix to an appropriate input format */
+/** converts the raw files provided by netflix to an appropriate input format */
+public final class NetflixDatasetConverter {
+
+ private static final Logger log = LoggerFactory.getLogger(NetflixDatasetConverter.class);
+
+ private static final Pattern SEPARATOR = Pattern.compile(",");
+ // In the raw Netflix files a line like "123:" introduces the ratings of movie 123.
+ private static final String MOVIE_DENOTER = ":";
+ private static final String TAB = "\t";
+ private static final String NEWLINE = "\n";
+
+ // Utility class: entry point only, never instantiated.
+ private NetflixDatasetConverter() {
+ }
+
+ /**
+ * Entry point. Writes {@code trainingSet/ratings.tsv} from the per-movie training
+ * files and {@code probeSet/ratings.tsv} by joining qualifying.txt (user/movie
+ * pairs) with judging.txt (the held-out ratings), both as user\tmovie\trating lines.
+ */
+ public static void main(String[] args) throws IOException {
+
+ if (args.length != 4) {
+ System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
+ + "/path/to/judging.txt /path/to/destination");
+ return;
+ }
+
+ String trainingDataDir = args[0];
+ String qualifyingTxt = args[1];
+ String judgingTxt = args[2];
+ Path outputPath = new Path(args[3]);
+
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
+
+ Preconditions.checkArgument(trainingDataDir != null, "Training Data location needs to be specified");
+ log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
+ try (BufferedWriter writer =
+ new BufferedWriter(
+ new OutputStreamWriter(
+ fs.create(new Path(outputPath, "trainingSet/ratings.tsv")), Charsets.UTF_8))){
+
+ int ratingsProcessed = 0;
+ // NOTE(review): listFiles() returns null if the directory does not exist or
+ // cannot be read, which would NPE here — consider validating trainingDataDir.
+ for (File movieRatings : new File(trainingDataDir).listFiles()) {
+ try (FileLineIterator lines = new FileLineIterator(movieRatings)) {
+ boolean firstLineRead = false;
+ String movieID = null;
+ while (lines.hasNext()) {
+ String line = lines.next();
+ if (firstLineRead) {
+ // Subsequent lines are "userID,rating,date"; only the first two fields are kept.
+ String[] tokens = SEPARATOR.split(line);
+ String userID = tokens[0];
+ String rating = tokens[1];
+ writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
+ ratingsProcessed++;
+ if (ratingsProcessed % 1000000 == 0) {
+ log.info("{} ratings processed...", ratingsProcessed);
+ }
+ } else {
+ // First line of each file is "movieID:" — strip the colon to get the ID.
+ movieID = line.replaceAll(MOVIE_DENOTER, "");
+ firstLineRead = true;
+ }
+ }
+ }
+
+ }
+ log.info("{} ratings processed. done.", ratingsProcessed);
+ }
+
+ log.info("Reading probes...");
+ // 2817131 = known number of probe entries; presized to avoid list growth.
+ List<Preference> probes = new ArrayList<>(2817131);
+ long currentMovieID = -1;
+ for (String line : new FileLineIterable(new File(qualifyingTxt))) {
+ if (line.contains(MOVIE_DENOTER)) {
+ currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
+ } else {
+ long userID = Long.parseLong(SEPARATOR.split(line)[0]);
+ // Rating is unknown at this point; filled in from judging.txt below.
+ probes.add(new GenericPreference(userID, currentMovieID, 0));
+ }
+ }
+ log.info("{} probes read...", probes.size());
+
+ log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
+ try (BufferedWriter writer =
+ new BufferedWriter(new OutputStreamWriter(
+ fs.create(new Path(outputPath, "probeSet/ratings.tsv")), Charsets.UTF_8))){
+ int ratingsProcessed = 0;
+ // judging.txt must list ratings in exactly the same order as qualifying.txt;
+ // the checkState below verifies the movie IDs stay aligned.
+ for (String line : new FileLineIterable(new File(judgingTxt))) {
+ if (line.contains(MOVIE_DENOTER)) {
+ currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
+ } else {
+ float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
+ Preference pref = probes.get(ratingsProcessed);
+ Preconditions.checkState(pref.getItemID() == currentMovieID);
+ ratingsProcessed++;
+ writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
+ if (ratingsProcessed % 1000000 == 0) {
+ log.info("{} ratings processed...", ratingsProcessed);
+ }
+ }
+ }
+ log.info("{} ratings processed. done.", ratingsProcessed);
+ }
+ }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
new file mode 100644
index 0000000..8021d00
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute.example;
+
+import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.precompute.FileSimilarItemsWriter;
+import org.apache.mahout.cf.taste.impl.similarity.precompute.MultithreadedBatchItemSimilarities;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
+import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
+
+import java.io.File;
+
+/**
+ * Example that precomputes all item similarities of the Movielens1M dataset
+ *
+ * Usage: download movielens1M from http://www.grouplens.org/node/73 , unzip it and invoke this code with the path
+ * to the ratings.dat file as argument
+ *
+ */
+public final class BatchItemSimilaritiesGroupLens {
+
+ // Utility class: entry point only, never instantiated.
+ private BatchItemSimilaritiesGroupLens() {}
+
+ /**
+ * Entry point. args[0] = path to the movielens1M ratings.dat file; results are
+ * written to similarities.csv in the system temp directory.
+ */
+ public static void main(String[] args) throws Exception {
+
+ if (args.length != 1) {
+ System.err.println("Need path to ratings.dat of the movielens1M dataset as argument!");
+ System.exit(-1);
+ }
+
+ // Remove any result file left over from a previous run.
+ File resultFile = new File(System.getProperty("java.io.tmpdir"), "similarities.csv");
+ if (resultFile.exists()) {
+ resultFile.delete();
+ }
+
+ DataModel dataModel = new GroupLensDataModel(new File(args[0]));
+ // Item-based recommender with log-likelihood similarity; 5 = similar items kept per item.
+ ItemBasedRecommender recommender = new GenericItemBasedRecommender(dataModel,
+ new LogLikelihoodSimilarity(dataModel));
+ BatchItemSimilarities batch = new MultithreadedBatchItemSimilarities(recommender, 5);
+
+ // One worker thread per core; 1 = degree of parallelism parameter per batch API.
+ int numSimilarities = batch.computeItemSimilarities(Runtime.getRuntime().availableProcessors(), 1,
+ new FileSimilarItemsWriter(resultFile));
+
+ System.out.println("Computed " + numSimilarities + " similarities for " + dataModel.getNumItems() + " items "
+ + "and saved them to " + resultFile.getAbsolutePath());
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
new file mode 100644
index 0000000..7ee9b17
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.similarity.precompute.example;
+
+import com.google.common.io.Files;
+import com.google.common.io.InputSupplier;
+import com.google.common.io.Resources;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.net.URL;
+import java.util.regex.Pattern;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+/**
+ * {@link FileDataModel} over GroupLens ratings.dat: the native "::"-delimited file
+ * is rewritten to a comma-delimited temp file (with the trailing timestamp field
+ * dropped) before being handed to the FileDataModel parser.
+ */
+public final class GroupLensDataModel extends FileDataModel {
+
+ private static final String COLON_DELIMTER = "::";
+ private static final Pattern COLON_DELIMITER_PATTERN = Pattern.compile(COLON_DELIMTER);
+
+ public GroupLensDataModel() throws IOException {
+ this(readResourceToTempFile("/org/apache/mahout/cf/taste/example/grouplens/ratings.dat"));
+ }
+
+ /**
+ * @param ratingsFile GroupLens ratings.dat file in its native format
+ * @throws IOException if an error occurs while reading or writing files
+ */
+ public GroupLensDataModel(File ratingsFile) throws IOException {
+ super(convertGLFile(ratingsFile));
+ }
+
+ private static File convertGLFile(File originalFile) throws IOException {
+ // Now translate the file; remove commas, then convert "::" delimiter to comma
+ File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
+ if (resultFile.exists()) {
+ resultFile.delete();
+ }
+ try (Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)){
+ for (String line : new FileLineIterable(originalFile, false)) {
+ int lastDelimiterStart = line.lastIndexOf(COLON_DELIMTER);
+ if (lastDelimiterStart < 0) {
+ throw new IOException("Unexpected input format on line: " + line);
+ }
+ // Everything after the last "::" (the timestamp field) is discarded.
+ String subLine = line.substring(0, lastDelimiterStart);
+ String convertedLine = COLON_DELIMITER_PATTERN.matcher(subLine).replaceAll(",");
+ writer.write(convertedLine);
+ writer.write('\n');
+ }
+ } catch (IOException ioe) {
+ // Don't leave a half-written temp file behind on failure.
+ resultFile.delete();
+ throw ioe;
+ }
+ return resultFile;
+ }
+
+ /**
+ * Copies a classpath resource (or, failing that, the same path under src/main/java)
+ * to a temp file that is deleted on JVM exit, and returns that file.
+ */
+ public static File readResourceToTempFile(String resourceName) throws IOException {
+ // NOTE(review): InputSupplier and these Files/Resources supplier methods are
+ // deprecated/removed in newer Guava — verify against the Guava version in use.
+ InputSupplier<? extends InputStream> inSupplier;
+ try {
+ URL resourceURL = Resources.getResource(GroupLensDataModel.class, resourceName);
+ inSupplier = Resources.newInputStreamSupplier(resourceURL);
+ } catch (IllegalArgumentException iae) {
+ // Resource not on the classpath; fall back to reading it from the source tree.
+ File resourceFile = new File("src/main/java" + resourceName);
+ inSupplier = Files.newInputStreamSupplier(resourceFile);
+ }
+ File tempFile = File.createTempFile("taste", null);
+ tempFile.deleteOnExit();
+ Files.copy(inSupplier, tempFile);
+ return tempFile;
+ }
+
+ @Override
+ public String toString() {
+ return "GroupLensDataModel";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
new file mode 100644
index 0000000..5cec51c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier;
+
+import com.google.common.collect.ConcurrentHashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Random;
+
+/**
+ * Helper for the 20-newsgroups classifier examples: tokenizes message files with a
+ * Lucene StandardAnalyzer and encodes them into sparse feature vectors, optionally
+ * leaking header/date information according to a "leakType" code.
+ */
+public final class NewsgroupHelper {
+
+ // Date formats of increasing precision, indexed by leakType % 3.
+ // NOTE(review): the first pattern is the empty string, which formats every date
+ // to "" — presumably meaning "leak no date info"; confirm this is intentional.
+ private static final SimpleDateFormat[] DATE_FORMATS = {
+ new SimpleDateFormat("", Locale.ENGLISH),
+ new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
+ new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
+ };
+
+ public static final int FEATURES = 10000;
+ // 1997-01-15 00:01:00 GMT
+ private static final long DATE_REFERENCE = 853286460;
+ // Durations in seconds (converted to millis when the synthetic date is built).
+ private static final long MONTH = 30 * 24 * 3600;
+ private static final long WEEK = 7 * 24 * 3600;
+
+ private final Random rand = RandomUtils.getRandom();
+ private final Analyzer analyzer = new StandardAnalyzer();
+ private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
+ private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
+
+ public FeatureVectorEncoder getEncoder() {
+ return encoder;
+ }
+
+ public FeatureVectorEncoder getBias() {
+ return bias;
+ }
+
+ public Random getRandom() {
+ return rand;
+ }
+
+ /**
+ * Reads one newsgroup message file and encodes it as a sparse vector of FEATURES
+ * dimensions. A synthetic date (reference + actual months + random jitter within a
+ * week) is tokenized too; which headers/body parts are counted depends on leakType.
+ * Every token seen is also accumulated into overallCounts.
+ */
+ public Vector encodeFeatureVector(File file, int actual, int leakType, Multiset<String> overallCounts)
+ throws IOException {
+ // Seconds -> milliseconds for java.util.Date.
+ long date = (long) (1000 * (DATE_REFERENCE + actual * MONTH + 1 * WEEK * rand.nextDouble()));
+ Multiset<String> words = ConcurrentHashMultiset.create();
+
+ try (BufferedReader reader = Files.newReader(file, Charsets.UTF_8)) {
+ String line = reader.readLine();
+ Reader dateString = new StringReader(DATE_FORMATS[leakType % 3].format(new Date(date)));
+ countWords(analyzer, words, dateString, overallCounts);
+ while (line != null && !line.isEmpty()) {
+ // Header lines: only selected headers count, and only for leakType < 6.
+ boolean countHeader = (
+ line.startsWith("From:") || line.startsWith("Subject:")
+ || line.startsWith("Keywords:") || line.startsWith("Summary:")) && leakType < 6;
+ // Header continuation lines start with whitespace; consume them together.
+ do {
+ Reader in = new StringReader(line);
+ if (countHeader) {
+ countWords(analyzer, words, in, overallCounts);
+ }
+ line = reader.readLine();
+ } while (line != null && line.startsWith(" "));
+ }
+ // The message body (everything after the blank line) counts only for leakType < 3.
+ if (leakType < 3) {
+ countWords(analyzer, words, reader, overallCounts);
+ }
+ }
+
+ Vector v = new RandomAccessSparseVector(FEATURES);
+ bias.addToVector("", 1, v);
+ // Log-scaled term counts dampen the influence of very frequent words.
+ for (String word : words.elementSet()) {
+ encoder.addToVector(word, Math.log1p(words.count(word)), v);
+ }
+
+ return v;
+ }
+
+ /**
+ * Tokenizes the given reader with the analyzer, adding each token to words and,
+ * at the end, all of words' current contents to overallCounts.
+ */
+ public static void countWords(Analyzer analyzer,
+ Collection<String> words,
+ Reader in,
+ Multiset<String> overallCounts) throws IOException {
+ TokenStream ts = analyzer.tokenStream("text", in);
+ ts.addAttribute(CharTermAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ String s = ts.getAttribute(CharTermAttribute.class).toString();
+ words.add(s);
+ }
+ // NOTE(review): this re-adds ALL accumulated words each call when the same
+ // collection is reused across calls (as encodeFeatureVector does) — confirm
+ // the double counting in overallCounts is intended.
+ overallCounts.addAll(words);
+ ts.end();
+ Closeables.close(ts, true);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
new file mode 100644
index 0000000..16e9d80
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+/**
+ * Convert the labels created by the {@link org.apache.mahout.utils.email.MailProcessor} to one consumable
+ * by the classifiers
+ */
+public class PrepEmailMapper extends Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
+
+ private static final Pattern DASH_DOT = Pattern.compile("-|\\.");
+ private static final Pattern SLASH = Pattern.compile("\\/");
+
+ private boolean useListName = false; //if true, use the project name and the list name in label creation
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ useListName = Boolean.parseBoolean(context.getConfiguration().get(PrepEmailVectorsDriver.USE_LIST_NAME));
+ }
+
+ @Override
+ protected void map(WritableComparable<?> key, VectorWritable value, Context context)
+ throws IOException, InterruptedException {
+ String input = key.toString();
+ ///Example: /cocoon.apache.org/dev/200307.gz/001401c3414f$8394e160$***@WRPO
+ String[] splits = SLASH.split(input);
+ //we need the first two splits;
+ if (splits.length >= 3) {
+ StringBuilder bldr = new StringBuilder();
+ bldr.append(escape(splits[1]));
+ if (useListName) {
+ bldr.append('_').append(escape(splits[2]));
+ }
+ context.write(new Text(bldr.toString()), value);
+ }
+
+ }
+
+ private static String escape(CharSequence value) {
+ return DASH_DOT.matcher(value).replaceAll("_").toLowerCase(Locale.ENGLISH);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
new file mode 100644
index 0000000..da6e613
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VectorWritable;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+public class PrepEmailReducer extends Reducer<Text, VectorWritable, Text, VectorWritable> {
+
+ private long maxItemsPerLabel = 10000;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ maxItemsPerLabel = Long.parseLong(context.getConfiguration().get(PrepEmailVectorsDriver.ITEMS_PER_CLASS));
+ }
+
+ @Override
+ protected void reduce(Text key, Iterable<VectorWritable> values, Context context)
+ throws IOException, InterruptedException {
+ //TODO: support randomization? Likely not needed due to the SplitInput utility which does random selection
+ long i = 0;
+ Iterator<VectorWritable> iterator = values.iterator();
+ while (i < maxItemsPerLabel && iterator.hasNext()) {
+ context.write(key, iterator.next());
+ i++;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
new file mode 100644
index 0000000..8fba739
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.email;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.VectorWritable;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Convert the labels generated by {@link org.apache.mahout.text.SequenceFilesFromMailArchives} and
+ * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles} to ones consumable by the classifiers. We do this
+ * here b/c if it is done in the creation of sparse vectors, the Reducer collapses all the vectors.
+ */
+public class PrepEmailVectorsDriver extends AbstractJob {
+
+ public static final String ITEMS_PER_CLASS = "itemsPerClass";
+ public static final String USE_LIST_NAME = "USE_LIST_NAME";
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new PrepEmailVectorsDriver(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption("maxItemsPerLabel", "mipl", "The maximum number of items per label. Can be useful for making the "
+ + "training sets the same size", String.valueOf(100000));
+ addOption(buildOption("useListName", "ul", "Use the name of the list as part of the label. If not set, then "
+ + "just use the project name", false, false, "false"));
+ Map<String,List<String>> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+ Job convertJob = prepareJob(input, output, SequenceFileInputFormat.class, PrepEmailMapper.class, Text.class,
+ VectorWritable.class, PrepEmailReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
+ convertJob.getConfiguration().set(ITEMS_PER_CLASS, getOption("maxItemsPerLabel"));
+ convertJob.getConfiguration().set(USE_LIST_NAME, String.valueOf(hasOption("useListName")));
+
+ boolean succeeded = convertJob.waitForCompletion(true);
+ return succeeded ? 0 : -1;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
new file mode 100644
index 0000000..9c0ef56
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sequencelearning.hmm;
+
+import com.google.common.io.Resources;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.Matrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+/**
+ * This class implements a sample program that uses a pre-tagged training data
+ * set to train an HMM model as a POS tagger. The training data is automatically
+ * downloaded from the following URL:
+ * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt It then
+ * trains an HMM Model using supervised learning and tests the model on the
+ * following test data set:
+ * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt Further
+ * details regarding the data files can be found at
+ * http://flexcrfs.sourceforge.net/#Case_Study
+ */
public final class PosTagger {

  private static final Logger log = LoggerFactory.getLogger(PosTagger.class);

  /** Splits data-file lines into the [word] [POS tag] [NP tag] fields. */
  private static final Pattern SPACE = Pattern.compile(" ");
  /** Splits a tokenized sentence on runs of one or more spaces. */
  private static final Pattern SPACES = Pattern.compile("[ ]+");

  /**
   * No public constructors for utility classes.
   */
  private PosTagger() {
    // nothing to do here really.
  }

  // NOTE(review): all of the state below is static and is mutated by readFromURL()/trainModel(),
  // so this example is neither thread-safe nor reentrant — acceptable for a one-shot demo only.

  /**
   * Model trained in the example.
   */
  private static HmmModel taggingModel;

  /**
   * Map for storing the IDs for the POS tags (hidden states)
   */
  private static Map<String, Integer> tagIDs;

  /**
   * Counter for the next assigned POS tag ID The value of 0 is reserved for
   * "unknown POS tag"
   */
  // NOTE(review): unlike nextWordId, this starts at 0, so the first real tag receives ID 0 even
  // though the comment above reserves 0 for "unknown POS tag" — confirm whether that collision
  // is intended.
  private static int nextTagId;

  /**
   * Map for storing the IDs for observed words (observed states)
   */
  private static Map<String, Integer> wordIDs;

  /**
   * Counter for the next assigned word ID The value of 0 is reserved for
   * "unknown word"
   */
  private static int nextWordId = 1; // 0 is reserved for "unknown word"

  /**
   * Used for storing a list of POS tags of read sentences.
   */
  private static List<int[]> hiddenSequences;

  /**
   * Used for storing a list of word tags of read sentences.
   */
  private static List<int[]> observedSequences;

  /**
   * number of read lines
   */
  private static int readLines;

  /**
   * Given an URL, this function fetches the data file, parses it, assigns POS
   * Tag/word IDs and fills the hiddenSequences/observedSequences lists with
   * data from those files. The data is expected to be in the following format
   * (one word per line): word pos-tag np-tag sentences are closed with the .
   * pos tag
   *
   * @param url Where the data file is stored
   * @param assignIDs Should IDs for unknown words/tags be assigned? (Needed for
   *                  training data, not needed for test data)
   * @throws IOException in case data file cannot be read.
   */
  private static void readFromURL(String url, boolean assignIDs) throws IOException {
    // initialize the data structure
    hiddenSequences = new LinkedList<>();
    observedSequences = new LinkedList<>();
    readLines = 0;

    // now read line by line of the input file
    List<Integer> observedSequence = new LinkedList<>();
    List<Integer> hiddenSequence = new LinkedList<>();

    for (String line :Resources.readLines(new URL(url), Charsets.UTF_8)) {
      if (line.isEmpty()) {
        // new sentence starts
        int[] observedSequenceArray = new int[observedSequence.size()];
        int[] hiddenSequenceArray = new int[hiddenSequence.size()];
        for (int i = 0; i < observedSequence.size(); ++i) {
          observedSequenceArray[i] = observedSequence.get(i);
          hiddenSequenceArray[i] = hiddenSequence.get(i);
        }
        // now register those arrays
        hiddenSequences.add(hiddenSequenceArray);
        observedSequences.add(observedSequenceArray);
        // and reset the linked lists
        observedSequence.clear();
        hiddenSequence.clear();
        continue;
      }
      readLines++;
      // we expect the format [word] [POS tag] [NP tag]
      String[] tags = SPACE.split(line);
      // when analyzing the training set, assign IDs
      if (assignIDs) {
        if (!wordIDs.containsKey(tags[0])) {
          wordIDs.put(tags[0], nextWordId++);
        }
        if (!tagIDs.containsKey(tags[1])) {
          tagIDs.put(tags[1], nextTagId++);
        }
      }
      // determine the IDs
      Integer wordID = wordIDs.get(tags[0]);
      Integer tagID = tagIDs.get(tags[1]);
      // now construct the current sequence
      // unknown words/tags (null lookups) map to the reserved ID 0
      if (wordID == null) {
        observedSequence.add(0);
      } else {
        observedSequence.add(wordID);
      }

      if (tagID == null) {
        hiddenSequence.add(0);
      } else {
        hiddenSequence.add(tagID);
      }
    }

    // if there is still something in the pipe, register it
    // (handles a file that does not end with a blank line)
    if (!observedSequence.isEmpty()) {
      int[] observedSequenceArray = new int[observedSequence.size()];
      int[] hiddenSequenceArray = new int[hiddenSequence.size()];
      for (int i = 0; i < observedSequence.size(); ++i) {
        observedSequenceArray[i] = observedSequence.get(i);
        hiddenSequenceArray[i] = hiddenSequence.get(i);
      }
      // now register those arrays
      hiddenSequences.add(hiddenSequenceArray);
      observedSequences.add(observedSequenceArray);
    }
  }

  /**
   * Downloads and parses the training data from the given URL (assigning IDs) and trains the
   * supervised HMM tagging model, storing it in {@link #taggingModel}.
   *
   * @param trainingURL location of the pre-tagged training data
   * @throws IOException if the training data cannot be fetched or read
   */
  private static void trainModel(String trainingURL) throws IOException {
    tagIDs = new HashMap<>(44); // we expect 44 distinct tags
    wordIDs = new HashMap<>(19122); // we expect 19122
    // distinct words
    log.info("Reading and parsing training data file from URL: {}", trainingURL);
    long start = System.currentTimeMillis();
    readFromURL(trainingURL, true);
    long end = System.currentTimeMillis();
    double duration = (end - start) / 1000.0;
    log.info("Parsing done in {} seconds!", duration);
    log.info("Read {} lines containing {} sentences with a total of {} distinct words and {} distinct POS tags.",
        readLines, hiddenSequences.size(), nextWordId - 1, nextTagId - 1);
    start = System.currentTimeMillis();
    taggingModel = HmmTrainer.trainSupervisedSequence(nextTagId, nextWordId,
        hiddenSequences, observedSequences, 0.05);
    // we have to adjust the model a bit,
    // since we assume a higher probability that a given unknown word is NNP
    // than anything else
    Matrix emissions = taggingModel.getEmissionMatrix();
    for (int i = 0; i < taggingModel.getNrOfHiddenStates(); ++i) {
      emissions.setQuick(i, 0, 0.1 / taggingModel.getNrOfHiddenStates());
    }
    int nnptag = tagIDs.get("NNP");
    emissions.setQuick(nnptag, 0, 1 / (double) taggingModel.getNrOfHiddenStates());
    // re-normalize the emission probabilities
    HmmUtils.normalizeModel(taggingModel);
    // now register the names
    taggingModel.registerHiddenStateNames(tagIDs);
    taggingModel.registerOutputStateNames(wordIDs);
    end = System.currentTimeMillis();
    duration = (end - start) / 1000.0;
    log.info("Trained HMM models in {} seconds!", duration);
  }

  /**
   * Downloads the test data from the given URL (without assigning new IDs), tags every sentence
   * with the trained model via Viterbi decoding and logs the resulting tag error rate.
   *
   * @param testingURL location of the pre-tagged test data
   * @throws IOException if the test data cannot be fetched or read
   */
  private static void testModel(String testingURL) throws IOException {
    log.info("Reading and parsing test data file from URL: {}", testingURL);
    long start = System.currentTimeMillis();
    readFromURL(testingURL, false);
    long end = System.currentTimeMillis();
    double duration = (end - start) / 1000.0;
    log.info("Parsing done in {} seconds!", duration);
    log.info("Read {} lines containing {} sentences.", readLines, hiddenSequences.size());

    start = System.currentTimeMillis();
    int errorCount = 0;
    int totalCount = 0;
    for (int i = 0; i < observedSequences.size(); ++i) {
      // fetch the viterbi path as the POS tag for this observed sequence
      int[] posEstimate = HmmEvaluator.decode(taggingModel, observedSequences.get(i), false);
      // compare with the expected
      int[] posExpected = hiddenSequences.get(i);
      for (int j = 0; j < posExpected.length; ++j) {
        totalCount++;
        if (posEstimate[j] != posExpected[j]) {
          errorCount++;
        }
      }
    }
    end = System.currentTimeMillis();
    duration = (end - start) / 1000.0;
    log.info("POS tagged test file in {} seconds!", duration);
    double errorRate = (double) errorCount / totalCount;
    log.info("Tagged the test file with an error rate of: {}", errorRate);
  }

  /**
   * Tags a single raw sentence with the trained model.
   *
   * @param sentence plain-text sentence; punctuation is isolated before tokenization
   * @return the decoded POS tag names, one per token of the sentence
   */
  private static List<String> tagSentence(String sentence) {
    // first, we need to isolate all punctuation characters, so that they
    // can be recognized
    sentence = sentence.replaceAll("[,.!?:;\"]", " $0 ");
    sentence = sentence.replaceAll("''", " '' ");
    // now we tokenize the sentence
    String[] tokens = SPACES.split(sentence);
    // now generate the observed sequence
    int[] observedSequence = HmmUtils.encodeStateSequence(taggingModel, Arrays.asList(tokens), true, 0);
    // POS tag this observedSequence
    int[] hiddenSequence = HmmEvaluator.decode(taggingModel, observedSequence, false);
    // and now decode the tag names
    return HmmUtils.decodeStateSequence(taggingModel, hiddenSequence, false, null);
  }

  /**
   * Trains on the CoNLL2000 training set, evaluates on the matching test set and then tags one
   * example sentence, logging each word with its predicted POS tag.
   *
   * @throws IOException if either data set cannot be downloaded or read
   */
  public static void main(String[] args) throws IOException {
    // generate the model from URL
    trainModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt");
    testModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt");
    // tag an exemplary sentence
    String test = "McDonalds is a huge company with many employees .";
    String[] testWords = SPACE.split(test);
    List<String> posTags = tagSentence(test);
    for (int i = 0; i < posTags.size(); ++i) {
      log.info("{}[{}]", testWords[i], posTags.get(i));
    }
  }

}
r***@apache.org
2018-06-28 14:54:49 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
new file mode 100644
index 0000000..3463ff5
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.model.UpdatableIDMigrator;
+
+/**
+ * Implementation which stores the reverse long-to-String mapping in memory.
+ */
+public final class MemoryIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator {
+
+ private final FastByIDMap<String> longToString;
+
+ public MemoryIDMigrator() {
+ this.longToString = new FastByIDMap<>(100);
+ }
+
+ @Override
+ public void storeMapping(long longID, String stringID) {
+ synchronized (longToString) {
+ longToString.put(longID, stringID);
+ }
+ }
+
+ @Override
+ public String toStringID(long longID) {
+ synchronized (longToString) {
+ return longToString.get(longID);
+ }
+ }
+
+ @Override
+ public void initialize(Iterable<String> stringIDs) {
+ for (String stringID : stringIDs) {
+ storeMapping(toLongID(stringID), stringID);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
new file mode 100644
index 0000000..b134598
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import javax.sql.DataSource;
+
+/**
+ * <p>
+ * An implementation for MySQL. The following statement would create a table suitable for use with this class:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * CREATE TABLE taste_id_migration (
+ * long_id BIGINT NOT NULL PRIMARY KEY,
+ * string_id VARCHAR(255) NOT NULL UNIQUE
+ * )
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>
+ * Separately, note that in a MySQL database, the following function calls will convert a string value into a
+ * numeric value in the same way that the standard implementations in this package do. This may be useful in
+ * writing SQL statements for use with
+ * {@code AbstractJDBCDataModel} subclasses which convert string
+ * column values to appropriate numeric values -- though this should be viewed as a temporary arrangement
+ * since it will impact performance:
+ * </p>
+ *
+ * <p>
+ * {@code cast(conv(substring(md5([column name]), 1, 16),16,10) as signed)}
+ * </p>
+ */
+public final class MySQLJDBCIDMigrator extends AbstractJDBCIDMigrator {
+
+ public MySQLJDBCIDMigrator(DataSource dataSource) {
+ this(dataSource, DEFAULT_MAPPING_TABLE,
+ DEFAULT_LONG_ID_COLUMN, DEFAULT_STRING_ID_COLUMN);
+ }
+
+ public MySQLJDBCIDMigrator(DataSource dataSource,
+ String mappingTable,
+ String longIDColumn,
+ String stringIDColumn) {
+ super(dataSource,
+ "SELECT " + stringIDColumn + " FROM " + mappingTable + " WHERE " + longIDColumn + "=?",
+ "INSERT IGNORE INTO " + mappingTable + " (" + longIDColumn + ',' + stringIDColumn + ") VALUES (?,?)");
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
new file mode 100644
index 0000000..c97a545
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import com.google.common.base.Preconditions;
+import java.util.List;
+import java.util.Map;
+import java.util.Queue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * This is a special thread-safe version of {@link PlusAnonymousUserDataModel}
+ * which allows multiple concurrent anonymous requests.
+ * </p>
+ *
+ * <p>
+ * To use it, you have to estimate the number of concurrent anonymous users of your application.
+ * The pool of users with the given size will be created. For each anonymous recommendations request,
+ * a user has to be taken from the pool and returned back immediately afterwards.
+ * </p>
+ *
+ * <p>
+ * If no more users are available in the pool, anonymous recommendations cannot be produced.
+ * </p>
+ *
+ * <p>
+ *
+ * Setup:
+ * <pre>
+ * int concurrentUsers = 100;
+ * DataModel realModel = ..
+ * PlusAnonymousConcurrentUserDataModel plusModel =
+ * new PlusAnonymousConcurrentUserDataModel(realModel, concurrentUsers);
+ * Recommender recommender = ...;
+ * </pre>
+ *
+ * Real-time recommendation:
+ * <pre>
+ * PlusAnonymousConcurrentUserDataModel plusModel =
+ * (PlusAnonymousConcurrentUserDataModel) recommender.getDataModel();
+ *
+ * // Take the next available anonymous user from the pool
+ * Long anonymousUserID = plusModel.takeAvailableUser();
+ *
+ * PreferenceArray tempPrefs = ..
+ * tempPrefs.setUserID(0, anonymousUserID);
+ * tempPrefs.setItemID(0, itemID);
+ * plusModel.setTempPrefs(tempPrefs, anonymousUserID);
+ *
+ * // Produce recommendations
+ * recommender.recommend(anonymousUserID, howMany);
+ *
+ * // It is very IMPORTANT to release user back to the pool
+ * plusModel.releaseUser(anonymousUserID);
+ * </pre>
+ *
+ * </p>
+ */
+public final class PlusAnonymousConcurrentUserDataModel extends PlusAnonymousUserDataModel {
+
+ /** Preferences for all anonymous users */
+ private final Map<Long,PreferenceArray> tempPrefs;
+ /** Item IDs set for all anonymous users */
+ private final Map<Long,FastIDSet> prefItemIDs;
+ /** Pool of the users (FIFO) */
+ private Queue<Long> usersPool;
+
+ private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class);
+
+ /**
+ * @param delegate Real model where anonymous users will be added to
+ * @param maxConcurrentUsers Maximum allowed number of concurrent anonymous users
+ */
  public PlusAnonymousConcurrentUserDataModel(DataModel delegate, int maxConcurrentUsers) {
    super(delegate);

    // ConcurrentHashMap so per-user preferences can be read and written without
    // locking the whole model.
    tempPrefs = new ConcurrentHashMap<>();
    prefItemIDs = new ConcurrentHashMap<>();

    initializeUsersPools(maxConcurrentUsers);
  }
+
+ /**
+ * Initialize the pool of concurrent anonymous users.
+ *
+ * @param usersPoolSize Maximum allowed number of concurrent anonymous user. Depends on the consumer system.
+ */
+ private void initializeUsersPools(int usersPoolSize) {
+ usersPool = new ConcurrentLinkedQueue<>();
+ for (int i = 0; i < usersPoolSize; i++) {
+ usersPool.add(TEMP_USER_ID + i);
+ }
+ }
+
+ /**
+ * Take the next available concurrent anonymous users from the pool.
+ *
+ * @return User ID or null if no more users are available
+ */
+ public Long takeAvailableUser() {
+ Long takenUserID = usersPool.poll();
+ if (takenUserID != null) {
+ // Initialize the preferences array to indicate that the user is taken.
+ tempPrefs.put(takenUserID, new GenericUserPreferenceArray(0));
+ return takenUserID;
+ }
+ return null;
+ }
+
+ /**
+ * Release previously taken anonymous user and return it to the pool.
+ *
+ * @param userID ID of a previously taken anonymous user
+ * @return true if the user was previously taken, false otherwise
+ */
+ public boolean releaseUser(Long userID) {
+ if (tempPrefs.containsKey(userID)) {
+ this.clearTempPrefs(userID);
+ // Return previously taken user to the pool
+ usersPool.offer(userID);
+ return true;
+ }
+ return false;
+ }
+
  /**
   * Checks whether a given user is a valid previously acquired anonymous user.
   */
  private boolean isAnonymousUser(long userID) {
    // A user is anonymous exactly while it is taken from the pool (tempPrefs entry present).
    return tempPrefs.containsKey(userID);
  }
+
+ /**
+ * Sets temporary preferences for a given anonymous user.
+ */
+ public void setTempPrefs(PreferenceArray prefs, long anonymousUserID) {
+ Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty");
+
+ this.tempPrefs.put(anonymousUserID, prefs);
+ FastIDSet userPrefItemIDs = new FastIDSet();
+
+ for (int i = 0; i < prefs.length(); i++) {
+ userPrefItemIDs.add(prefs.getItemID(i));
+ }
+
+ this.prefItemIDs.put(anonymousUserID, userPrefItemIDs);
+ }
+
  /**
   * Clears temporary preferences for a given anonymous user.
   */
  public void clearTempPrefs(long anonymousUserID) {
    // Removing both entries drops the user's data and marks the ID as no longer "taken".
    this.tempPrefs.remove(anonymousUserID);
    this.prefItemIDs.remove(anonymousUserID);
  }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ // Anonymous users have short lifetime and should not be included into the neighborhoods of the real users.
+ // Thus exclude them from the universe.
+ return getDelegate().getUserIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+ // Anonymous users are answered from the in-memory temp store; everyone else from the delegate.
+ if (isAnonymousUser(userID)) {
+ return tempPrefs.get(userID);
+ }
+ return getDelegate().getPreferencesFromUser(userID);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ // Served from the prefItemIDs index maintained by setTempPrefs().
+ if (isAnonymousUser(userID)) {
+ return prefItemIDs.get(userID);
+ }
+ return getDelegate().getItemIDsFromUser(userID);
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ // Fast path: no anonymous users active, delegate answers alone.
+ if (tempPrefs.isEmpty()) {
+ return getDelegate().getPreferencesForItem(itemID);
+ }
+
+ PreferenceArray delegatePrefs = null;
+
+ try {
+ delegatePrefs = getDelegate().getPreferencesForItem(itemID);
+ } catch (NoSuchItemException nsie) {
+ // OK. Probably an item that only the anonymous user has
+ if (log.isDebugEnabled()) {
+ log.debug("Item {} unknown", itemID);
+ }
+ }
+
+ // Collect preferences for this item across all currently-active anonymous users.
+ List<Preference> anonymousPreferences = Lists.newArrayList();
+
+ for (Map.Entry<Long, PreferenceArray> prefsMap : tempPrefs.entrySet()) {
+ PreferenceArray singleUserTempPrefs = prefsMap.getValue();
+ for (int i = 0; i < singleUserTempPrefs.length(); i++) {
+ if (singleUserTempPrefs.getItemID(i) == itemID) {
+ anonymousPreferences.add(singleUserTempPrefs.get(i));
+ }
+ }
+ }
+
+ int delegateLength = delegatePrefs == null ? 0 : delegatePrefs.length();
+ int anonymousPrefsLength = anonymousPreferences.size();
+ int prefsCounter = 0;
+
+ // Merge the delegate and anonymous preferences into a single array
+ PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(delegateLength + anonymousPrefsLength);
+
+ for (int i = 0; i < delegateLength; i++) {
+ newPreferenceArray.set(prefsCounter++, delegatePrefs.get(i));
+ }
+
+ for (Preference anonymousPreference : anonymousPreferences) {
+ newPreferenceArray.set(prefsCounter++, anonymousPreference);
+ }
+
+ if (newPreferenceArray.length() == 0) {
+ // Item was found neither in the delegate nor among any anonymous user's prefs
+ throw new NoSuchItemException(itemID);
+ }
+
+ return newPreferenceArray;
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ if (isAnonymousUser(userID)) {
+ // Linear scan of the anonymous user's prefs; arrays here are expected to be small.
+ PreferenceArray singleUserTempPrefs = tempPrefs.get(userID);
+ for (int i = 0; i < singleUserTempPrefs.length(); i++) {
+ if (singleUserTempPrefs.getItemID(i) == itemID) {
+ return singleUserTempPrefs.getValue(i);
+ }
+ }
+ // null signals "no preference expressed", mirroring the DataModel contract.
+ return null;
+ }
+ return getDelegate().getPreferenceValue(userID, itemID);
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ if (isAnonymousUser(userID)) {
+ // Timestamps are not saved for anonymous preferences
+ return null;
+ }
+ return getDelegate().getPreferenceTime(userID, itemID);
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ // Anonymous users have short lifetime and should not be included into the neighborhoods of the real users.
+ // Thus exclude them from the universe.
+ return getDelegate().getNumUsers();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ if (tempPrefs.isEmpty()) {
+ return getDelegate().getNumUsersWithPreferenceFor(itemID);
+ }
+
+ int countAnonymousUsersWithPreferenceFor = 0;
+
+ // Each anonymous user counts at most once for the item (break after first match).
+ for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
+ for (int i = 0; i < singleUserTempPrefs.getValue().length(); i++) {
+ if (singleUserTempPrefs.getValue().getItemID(i) == itemID) {
+ countAnonymousUsersWithPreferenceFor++;
+ break;
+ }
+ }
+ }
+ return getDelegate().getNumUsersWithPreferenceFor(itemID) + countAnonymousUsersWithPreferenceFor;
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ if (tempPrefs.isEmpty()) {
+ return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2);
+ }
+
+ int countAnonymousUsersWithPreferenceFor = 0;
+
+ // Count anonymous users who have prefs for BOTH items; stop scanning once both are seen.
+ for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) {
+ boolean found1 = false;
+ boolean found2 = false;
+ for (int i = 0; i < singleUserTempPrefs.getValue().length() && !(found1 && found2); i++) {
+ long itemID = singleUserTempPrefs.getValue().getItemID(i);
+ if (itemID == itemID1) {
+ found1 = true;
+ }
+ if (itemID == itemID2) {
+ found2 = true;
+ }
+ }
+
+ if (found1 && found2) {
+ countAnonymousUsersWithPreferenceFor++;
+ }
+ }
+
+ return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2) + countAnonymousUsersWithPreferenceFor;
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ // Anonymous prefs are managed only through setTempPrefs()/clearTempPrefs(); mutation here is disallowed.
+ if (isAnonymousUser(userID)) {
+ throw new UnsupportedOperationException();
+ }
+ getDelegate().setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ // Same rationale as setPreference(): anonymous prefs are immutable through the DataModel API.
+ if (isAnonymousUser(userID)) {
+ throw new UnsupportedOperationException();
+ }
+ getDelegate().removePreference(userID, itemID);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
new file mode 100644
index 0000000..546349b
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java
@@ -0,0 +1,320 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.NoSuchUserException;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * This {@link DataModel} decorator class is useful in a situation where you wish to recommend to a user that
+ * doesn't really exist yet in your actual {@link DataModel}. For example maybe you wish to recommend DVDs to
+ * a user who has browsed a few titles on your DVD store site, but, the user is not yet registered.
+ * </p>
+ *
+ * <p>
+ * This enables you to temporarily add a temporary user to an existing {@link DataModel} in a way that
+ * recommenders can then produce recommendations anyway. To do so, wrap your real implementation in this
+ * class:
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * DataModel realModel = ...;
+ * DataModel plusModel = new PlusAnonymousUserDataModel(realModel);
+ * ...
+ * ItemSimilarity similarity = new LogLikelihoodSimilarity(realModel); // not plusModel
+ * </pre>
+ *
+ * </p>
+ *
+ * <p>
+ * But, you may continue to use {@code realModel} as input to other components. To recommend, first construct and
+ * set the temporary user information on the model and then simply call the recommender. The
+ * {@code synchronized} block exists to remind you that this is of course not thread-safe. Only one set
+ * of temp data can be inserted into the model and used at one time.
+ * </p>
+ *
+ * <p>
+ *
+ * <pre>
+ * Recommender recommender = ...;
+ * ...
+ * synchronized(...) {
+ * PreferenceArray tempPrefs = ...;
+ * plusModel.setTempPrefs(tempPrefs);
+ * recommender.recommend(PlusAnonymousUserDataModel.TEMP_USER_ID, 10);
+ * plusModel.clearTempPrefs();
+ * }
+ * </pre>
+ *
+ * </p>
+ */
+public class PlusAnonymousUserDataModel implements DataModel {
+
+ public static final long TEMP_USER_ID = Long.MIN_VALUE;
+
+ // Wrapped real model; every query not involving the temp user is forwarded to it.
+ private final DataModel delegate;
+ // Preferences of the single anonymous user, or null when no temp user is currently set.
+ private PreferenceArray tempPrefs;
+ // Item IDs referenced by tempPrefs; kept in sync by setTempPrefs()/clearTempPrefs().
+ private final FastIDSet prefItemIDs;
+
+ private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class);
+
+ /**
+ * @param delegate real {@link DataModel} to decorate with one anonymous user
+ */
+ public PlusAnonymousUserDataModel(DataModel delegate) {
+ this.delegate = delegate;
+ this.prefItemIDs = new FastIDSet();
+ }
+
+ protected DataModel getDelegate() {
+ return delegate;
+ }
+
+ /**
+ * Installs the anonymous user's preferences. Not thread-safe; see class javadoc.
+ *
+ * @param prefs non-null, non-empty preferences for the anonymous user
+ * @throws IllegalArgumentException if prefs is null or empty
+ */
+ public void setTempPrefs(PreferenceArray prefs) {
+ Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty");
+ this.tempPrefs = prefs;
+ this.prefItemIDs.clear();
+ // Index the item IDs so getItemIDsFromUser() does not rescan the array.
+ for (int i = 0; i < prefs.length(); i++) {
+ this.prefItemIDs.add(prefs.getItemID(i));
+ }
+ }
+
+ /** Removes the anonymous user entirely; the model behaves like the bare delegate again. */
+ public void clearTempPrefs() {
+ tempPrefs = null;
+ prefItemIDs.clear();
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ if (tempPrefs == null) {
+ return delegate.getUserIDs();
+ }
+ // Splice TEMP_USER_ID (Long.MIN_VALUE, hence first) into the delegate's ordered iteration.
+ return new PlusAnonymousUserLongPrimitiveIterator(delegate.getUserIDs(), TEMP_USER_ID);
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ return tempPrefs;
+ }
+ return delegate.getPreferencesFromUser(userID);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ return prefItemIDs;
+ }
+ return delegate.getItemIDsFromUser(userID);
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() throws TasteException {
+ return delegate.getItemIDs();
+ // Yeah ignoring items that only the plus-one user knows about... can't really happen
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ if (tempPrefs == null) {
+ return delegate.getPreferencesForItem(itemID);
+ }
+ PreferenceArray delegatePrefs = null;
+ try {
+ delegatePrefs = delegate.getPreferencesForItem(itemID);
+ } catch (NoSuchItemException nsie) {
+ // OK. Probably an item that only the anonymous user has
+ if (log.isDebugEnabled()) {
+ log.debug("Item {} unknown", itemID);
+ }
+ }
+ // If the anonymous user also rated this item, merge that one preference in.
+ for (int i = 0; i < tempPrefs.length(); i++) {
+ if (tempPrefs.getItemID(i) == itemID) {
+ return cloneAndMergeInto(delegatePrefs, itemID, tempPrefs.getUserID(i), tempPrefs.getValue(i));
+ }
+ }
+ if (delegatePrefs == null) {
+ // No, didn't find it among the anonymous user prefs
+ throw new NoSuchItemException(itemID);
+ }
+ return delegatePrefs;
+ }
+
+ /**
+ * Copies {@code delegatePrefs} into a new array one element longer, inserting the anonymous
+ * user's preference at the position that keeps the array sorted by user ID.
+ */
+ private static PreferenceArray cloneAndMergeInto(PreferenceArray delegatePrefs,
+ long itemID,
+ long newUserID,
+ float value) {
+
+ int length = delegatePrefs == null ? 0 : delegatePrefs.length();
+ int newLength = length + 1;
+ PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(newLength);
+
+ // Set item ID once
+ newPreferenceArray.setItemID(0, itemID);
+
+ // Find the insertion point preserving ascending user-ID order.
+ int positionToInsert = 0;
+ while (positionToInsert < length && newUserID > delegatePrefs.getUserID(positionToInsert)) {
+ positionToInsert++;
+ }
+
+ for (int i = 0; i < positionToInsert; i++) {
+ newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i));
+ newPreferenceArray.setValue(i, delegatePrefs.getValue(i));
+ }
+ newPreferenceArray.setUserID(positionToInsert, newUserID);
+ newPreferenceArray.setValue(positionToInsert, value);
+ for (int i = positionToInsert + 1; i < newLength; i++) {
+ newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i - 1));
+ newPreferenceArray.setValue(i, delegatePrefs.getValue(i - 1));
+ }
+
+ return newPreferenceArray;
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ for (int i = 0; i < tempPrefs.length(); i++) {
+ if (tempPrefs.getItemID(i) == itemID) {
+ return tempPrefs.getValue(i);
+ }
+ }
+ // null signals "no preference expressed", per the DataModel contract.
+ return null;
+ }
+ return delegate.getPreferenceValue(userID, itemID);
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ // Timestamps are not tracked for the anonymous user's preferences.
+ return null;
+ }
+ return delegate.getPreferenceTime(userID, itemID);
+ }
+
+ @Override
+ public int getNumItems() throws TasteException {
+ return delegate.getNumItems();
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ // The anonymous user counts as one extra user while set.
+ return delegate.getNumUsers() + (tempPrefs == null ? 0 : 1);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ if (tempPrefs == null) {
+ return delegate.getNumUsersWithPreferenceFor(itemID);
+ }
+ boolean found = false;
+ for (int i = 0; i < tempPrefs.length(); i++) {
+ if (tempPrefs.getItemID(i) == itemID) {
+ found = true;
+ break;
+ }
+ }
+ return delegate.getNumUsersWithPreferenceFor(itemID) + (found ? 1 : 0);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ if (tempPrefs == null) {
+ return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ }
+ // Anonymous user counts only if it rated BOTH items; stop scanning once both are seen.
+ boolean found1 = false;
+ boolean found2 = false;
+ for (int i = 0; i < tempPrefs.length() && !(found1 && found2); i++) {
+ long itemID = tempPrefs.getItemID(i);
+ if (itemID == itemID1) {
+ found1 = true;
+ }
+ if (itemID == itemID2) {
+ found2 = true;
+ }
+ }
+ return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2) + (found1 && found2 ? 1 : 0);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ // Anonymous prefs are managed only through setTempPrefs()/clearTempPrefs().
+ throw new UnsupportedOperationException();
+ }
+ delegate.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ if (userID == TEMP_USER_ID) {
+ if (tempPrefs == null) {
+ throw new NoSuchUserException(TEMP_USER_ID);
+ }
+ // Same rationale as setPreference().
+ throw new UnsupportedOperationException();
+ }
+ delegate.removePreference(userID, itemID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ delegate.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return delegate.hasPreferenceValues();
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return delegate.getMaxPreference();
+ }
+
+ @Override
+ public float getMinPreference() {
+ return delegate.getMinPreference();
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
new file mode 100644
index 0000000..ea4df85
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model;
+
+import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+
+/**
+ * Iterates the delegate's (ascending) long sequence with one extra value spliced in at its
+ * sorted position. Used to surface {@code TEMP_USER_ID} among the real user IDs.
+ */
+final class PlusAnonymousUserLongPrimitiveIterator extends AbstractLongPrimitiveIterator {
+
+ private final LongPrimitiveIterator delegate;
+ // The single extra value to merge into the delegate's stream.
+ private final long extraDatum;
+ // True once extraDatum has been emitted; afterwards we are a plain pass-through.
+ private boolean datumConsumed;
+
+ PlusAnonymousUserLongPrimitiveIterator(LongPrimitiveIterator delegate, long extraDatum) {
+ this.delegate = delegate;
+ this.extraDatum = extraDatum;
+ datumConsumed = false;
+ }
+
+ @Override
+ public long nextLong() {
+ if (datumConsumed) {
+ return delegate.nextLong();
+ } else {
+ if (delegate.hasNext()) {
+ long delegateNext = delegate.peek();
+ // Emit extraDatum as soon as it sorts at-or-before the delegate's next value.
+ // NOTE(review): if extraDatum equals an existing delegate value, both are emitted — assumes
+ // the extra value (TEMP_USER_ID) never collides with a real ID; confirm at call sites.
+ if (extraDatum <= delegateNext) {
+ datumConsumed = true;
+ return extraDatum;
+ } else {
+ return delegate.next();
+ }
+ } else {
+ // Delegate exhausted: the extra value goes last.
+ datumConsumed = true;
+ return extraDatum;
+ }
+ }
+ }
+
+ @Override
+ public long peek() {
+ // Mirrors nextLong()'s ordering decision without consuming anything.
+ if (datumConsumed) {
+ return delegate.peek();
+ } else {
+ if (delegate.hasNext()) {
+ long delegateNext = delegate.peek();
+ if (extraDatum <= delegateNext) {
+ return extraDatum;
+ } else {
+ return delegateNext;
+ }
+ } else {
+ return extraDatum;
+ }
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ // More to emit while the extra value is pending or the delegate has elements.
+ return !datumConsumed || delegate.hasNext();
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void skip(int n) {
+ // Advance element-by-element so the extra value is skipped in correct order too.
+ for (int i = 0; i < n; i++) {
+ nextLong();
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
new file mode 100644
index 0000000..0399618
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
@@ -0,0 +1,758 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.file;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.concurrent.locks.ReentrantLock;
+
+import com.google.common.base.Preconditions;
+import com.google.common.base.Splitter;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.AbstractDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * A {@link DataModel} backed by a delimited file. This class expects a file where each line
+ * contains a user ID, followed by item ID, followed by optional preference value, followed by
+ * optional timestamp. Commas or tabs delimit fields:
+ * </p>
+ *
+ * <p>{@code userID,itemID[,preference[,timestamp]]}</p>
+ *
+ * <p>
+ * Preference value is optional to accommodate applications that have no notion of a
+ * preference value (that is, the user simply expresses a
+ * preference for an item, but no degree of preference).
+ * </p>
+ *
+ * <p>
+ * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are
+ * read parsed as {@code long}s. The timestamp, if present, is assumed to be parseable as a
+ * {@code long}, though this can be overridden via {@link #readTimestampFromString(String)}.
+ * The preference value may be empty, to indicate "no preference value", but the field itself cannot be
+ * omitted when a timestamp follows. That is, this is legal:
+ * </p>
+ *
+ * <p>{@code 123,456,,129050099059}</p>
+ *
+ * <p>But this isn't:</p>
+ *
+ * <p>{@code 123,456,129050099059}</p>
+ *
+ * <p>
+ * It is also acceptable for the lines to contain additional fields. Fields beyond the third will be ignored.
+ * An empty line, or one that begins with '#' will be ignored as a comment.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ *
+ * <p>
+ * This class will also look for update "delta" files in the same directory, with file names that start the
+ * same way (up to the first period). These files have the same format, and provide updated data that
+ * supersedes what is in the main data file. This is a mechanism that allows an application to push updates to
+ * {@link FileDataModel} without re-copying the entire data file.
+ * </p>
+ *
+ * <p>
+ * One small format difference exists. Update files must also be able to express deletes.
+ * This is done by ending with a blank preference value, as in "123,456,".
+ * </p>
+ *
+ * <p>
+ * Note that it's all-or-nothing -- all of the items in the file must express no preference, or they all must.
+ * These cannot be mixed. Put another way there will always be the same number of delimiters on every line of
+ * the file!
+ * </p>
+ *
+ * <p>
+ * This class is not intended for use with very large amounts of data (over, say, tens of millions of rows).
+ * For that, a JDBC-backed {@link DataModel} and a database are more appropriate.
+ * </p>
+ *
+ * <p>
+ * It is possible and likely useful to subclass this class and customize its behavior to accommodate
+ * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} and
+ * {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)}
+ */
+public class FileDataModel extends AbstractDataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(FileDataModel.class);
+
+ public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+ private static final char COMMENT_CHAR = '#';
+ private static final char[] DELIMIETERS = {',', '\t'};
+
+ private final File dataFile;
+ private long lastModified;
+ private long lastUpdateFileModified;
+ private final transient Splitter delimiterPattern;
+ private final boolean hasPrefValues;
+ private DataModel delegate;
+ private final ReentrantLock reloadLock;
+ private final boolean transpose;
+ private final long minReloadIntervalMS;
+
+ /**
+ * Builds a model with no transposition and the default minimum reload interval
+ * ({@code DEFAULT_MIN_RELOAD_INTERVAL_MS}).
+ *
+ * @param dataFile
+ * file containing preferences data. If file is compressed (and name ends in .gz or .zip
+ * accordingly) it will be decompressed as it is read)
+ * @throws FileNotFoundException
+ * if dataFile does not exist
+ * @throws IOException
+ * if file can't be read
+ */
+ public FileDataModel(File dataFile) throws IOException {
+ this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+ }
+
+ /**
+ * @param delimiterRegex If your data file doesn't use '\t' or ',' as delimiter, you can specify
+ * a custom regex pattern.
+ */
+ public FileDataModel(File dataFile, String delimiterRegex) throws IOException {
+ this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS, delimiterRegex);
+ }
+
+ /**
+ * @param transpose
+ * transposes user IDs and item IDs -- convenient for 'flipping' the data model this way
+ * @param minReloadIntervalMS
+ * the minimum interval in milliseconds after which a full reload of the original datafile is done
+ * when refresh() is called
+ * @see #FileDataModel(File)
+ */
+ public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException {
+ this(dataFile, transpose, minReloadIntervalMS, null);
+ }
+
+ /**
+ * Primary constructor: validates the file, sniffs the delimiter and whether preference
+ * values are present from the first non-comment line, then performs the initial load.
+ *
+ * @param dataFile preference data file; must exist, be non-empty, and not be a directory
+ * @param transpose if true, user IDs and item IDs are swapped on read
+ * @param minReloadIntervalMS minimum milliseconds between full reloads on refresh(); non-negative
+ * @param delimiterRegex If your data file doesn't use '\t' or ',' as delimiters, you can specify
+ * your own using a regex pattern; null selects automatic single-char detection.
+ * @throws IOException if the file cannot be read
+ */
+ public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex)
+ throws IOException {
+
+ this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile());
+ if (!dataFile.exists() || dataFile.isDirectory()) {
+ throw new FileNotFoundException(dataFile.toString());
+ }
+ Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty");
+ Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative");
+
+ log.info("Creating FileDataModel for file {}", dataFile);
+
+ this.lastModified = dataFile.lastModified();
+ this.lastUpdateFileModified = readLastUpdateFileModified();
+
+ // Peek at the first non-comment, non-empty line to sniff the format.
+ // NOTE(review): if the file holds only comments/blank lines, peek() behavior past EOF and the
+ // iterator not being closed on that path should be confirmed against FileLineIterator.
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ String firstLine = iterator.peek();
+ while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) {
+ iterator.next();
+ firstLine = iterator.peek();
+ }
+ Closeables.close(iterator, true);
+
+ char delimiter;
+ if (delimiterRegex == null) {
+ delimiter = determineDelimiter(firstLine);
+ delimiterPattern = Splitter.on(delimiter);
+ } else {
+ delimiter = '\0';
+ delimiterPattern = Splitter.onPattern(delimiterRegex);
+ if (!delimiterPattern.split(firstLine).iterator().hasNext()) {
+ throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line");
+ }
+ }
+ List<String> firstLineSplit = new ArrayList<>();
+ for (String token : delimiterPattern.split(firstLine)) {
+ firstLineSplit.add(token);
+ }
+ // If preference value exists and isn't empty then the file is specifying pref values
+ hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty();
+
+ this.reloadLock = new ReentrantLock();
+ this.transpose = transpose;
+ this.minReloadIntervalMS = minReloadIntervalMS;
+
+ reload();
+ }
+
+ /** @return the (absolute) data file this model reads from */
+ public File getDataFile() {
+ return dataFile;
+ }
+
+ /**
+ * Rebuilds the delegate model from disk. Uses tryLock(): if another thread is already
+ * reloading, this call returns immediately without reloading (no double work, no blocking).
+ * An IOException during the rebuild is logged and swallowed, leaving the old delegate in place.
+ */
+ protected void reload() {
+ if (reloadLock.tryLock()) {
+ try {
+ delegate = buildModel();
+ } catch (IOException ioe) {
+ log.warn("Exception while reloading", ioe);
+ } finally {
+ reloadLock.unlock();
+ }
+ }
+ }
+
+ /**
+ * Reads the main data file and any update "delta" files and builds the delegate model.
+ *
+ * <p>Performs a full reload on the first build, or when the main file is newer than the last
+ * load by more than {@code minReloadIntervalMS}; otherwise applies only update files that are
+ * newer than what was already consumed on top of the existing in-memory raw data.</p>
+ *
+ * @return a {@link GenericDataModel} when the file carries explicit preference values,
+ * otherwise a {@link GenericBooleanPrefDataModel}
+ * @throws IOException if the data or update files cannot be read
+ */
+ protected DataModel buildModel() throws IOException {
+
+ long newLastModified = dataFile.lastModified();
+ long newLastUpdateFileModified = readLastUpdateFileModified();
+
+ boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS;
+
+ // Remember the previous update-file watermark before advancing both timestamps.
+ long oldLastUpdateFileModified = lastUpdateFileModified;
+ lastModified = newLastModified;
+ lastUpdateFileModified = newLastUpdateFileModified;
+
+ FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
+
+ if (hasPrefValues) {
+
+ if (loadFreshData) {
+
+ FastByIDMap<Collection<Preference>> data = new FastByIDMap<>();
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ processFile(iterator, data, timestamps, false);
+
+ for (File updateFile : findUpdateFilesAfter(newLastModified)) {
+ processFile(new FileLineIterator(updateFile, false), data, timestamps, false);
+ }
+
+ return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps);
+
+ } else {
+
+ // Incremental path: mutate the existing delegate's raw data with newer update files only.
+ FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData();
+
+ for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModified, newLastModified))) {
+ processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true);
+ }
+
+ return new GenericDataModel(rawData, timestamps);
+
+ }
+
+ } else {
+
+ if (loadFreshData) {
+
+ FastByIDMap<FastIDSet> data = new FastByIDMap<>();
+ FileLineIterator iterator = new FileLineIterator(dataFile, false);
+ processFileWithoutID(iterator, data, timestamps);
+
+ for (File updateFile : findUpdateFilesAfter(newLastModified)) {
+ processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps);
+ }
+
+ return new GenericBooleanPrefDataModel(data, timestamps);
+
+ } else {
+
+ FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData();
+
+ for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModified, newLastModified))) {
+ processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps);
+ }
+
+ return new GenericBooleanPrefDataModel(rawData, timestamps);
+
+ }
+
+ }
+ }
+
+ /**
+ * Finds update delta files in the same directory as the data file. This finds any file whose name starts
+ * the same way as the data file (up to first period) but isn't the data file itself. For example, if the
+ * data file is /foo/data.txt.gz, you might place update files at /foo/data.1.txt.gz, /foo/data.2.txt.gz,
+ * etc.
+ *
+ * @param minimumLastModified only files modified at or after this timestamp are returned
+ * @return update files ordered by ascending last-modified time (TreeMap iteration order)
+ */
+ private Iterable<File> findUpdateFilesAfter(long minimumLastModified) {
+ String dataFileName = dataFile.getName();
+ int period = dataFileName.indexOf('.');
+ String startName = period < 0 ? dataFileName : dataFileName.substring(0, period);
+ File parentDir = dataFile.getParentFile();
+ // Keyed by lastModified: two update files with the identical timestamp would collide here and
+ // one would be silently dropped -- NOTE(review): confirm this is acceptable for update cadence.
+ Map<Long, File> modTimeToUpdateFile = new TreeMap<>();
+ FileFilter onlyFiles = new FileFilter() {
+ @Override
+ public boolean accept(File file) {
+ return !file.isDirectory();
+ }
+ };
+ for (File updateFile : parentDir.listFiles(onlyFiles)) {
+ String updateFileName = updateFile.getName();
+ if (updateFileName.startsWith(startName)
+ && !updateFileName.equals(dataFileName)
+ && updateFile.lastModified() >= minimumLastModified) {
+ modTimeToUpdateFile.put(updateFile.lastModified(), updateFile);
+ }
+ }
+ return modTimeToUpdateFile.values();
+ }
+
+ private long readLastUpdateFileModified() {
+ long mostRecentModification = Long.MIN_VALUE;
+ for (File updateFile : findUpdateFilesAfter(0L)) {
+ mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified());
+ }
+ return mostRecentModification;
+ }
+
+ public static char determineDelimiter(String line) {
+ for (char possibleDelimieter : DELIMIETERS) {
+ if (line.indexOf(possibleDelimieter) >= 0) {
+ return possibleDelimieter;
+ }
+ }
+ throw new IllegalArgumentException("Did not find a delimiter in first line");
+ }
+
+ protected void processFile(FileLineIterator dataOrUpdateFileIterator,
+ FastByIDMap<?> data,
+ FastByIDMap<FastByIDMap<Long>> timestamps,
+ boolean fromPriorData) {
+ log.info("Reading file info...");
+ int count = 0;
+ while (dataOrUpdateFileIterator.hasNext()) {
+ String line = dataOrUpdateFileIterator.next();
+ if (!line.isEmpty()) {
+ processLine(line, data, timestamps, fromPriorData);
+ if (++count % 1000000 == 0) {
+ log.info("Processed {} lines", count);
+ }
+ }
+ }
+ log.info("Read lines: {}", count);
+ }
+
  /**
   * <p>
   * Reads one line from the input file and adds the data to a {@link FastByIDMap} data structure which maps user IDs
   * to preferences. This assumes that each line of the input file corresponds to one preference. After
   * reading a line and determining which user and item the preference pertains to, the method should look to
   * see if the data contains a mapping for the user ID already, and if not, add an empty data structure of preferences
   * as appropriate to the data.
   * </p>
   *
   * <p>
   * Note that if the line is empty or begins with '#' it will be ignored as a comment.
   * </p>
   *
   * @param line
   *          line from input data file
   * @param data
   *          all data read so far, as a mapping from user IDs to preferences
   * @param timestamps
   *          mapping from user ID to a per-item map of preference timestamps; updated in place
   * @param fromPriorData an implementation detail -- if true, data will map IDs to
   *          {@link PreferenceArray} since the framework is attempting to read and update raw
   *          data that is already in memory. Otherwise it maps to {@link Collection}s of
   *          {@link Preference}s, since it's reading fresh data. Subclasses must be prepared
   *          to handle this wrinkle.
   */
  protected void processLine(String line,
                             FastByIDMap<?> data,
                             FastByIDMap<FastByIDMap<Long>> timestamps,
                             boolean fromPriorData) {

    // Ignore empty lines and comments
    if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
      return;
    }

    // Expected field order: userID, itemID, preferenceValue[, timestamp]
    Iterator<String> tokens = delimiterPattern.split(line).iterator();
    String userIDString = tokens.next();
    String itemIDString = tokens.next();
    String preferenceValueString = tokens.next();
    boolean hasTimestamp = tokens.hasNext();
    String timestampString = hasTimestamp ? tokens.next() : null;

    long userID = readUserIDFromString(userIDString);
    long itemID = readItemIDFromString(itemIDString);

    // Optionally swap the roles of user and item (the 'transpose' flag is configured elsewhere)
    if (transpose) {
      long tmp = userID;
      userID = itemID;
      itemID = tmp;
    }

    // This is kind of gross but need to handle two types of storage:
    // PreferenceArray when updating prior in-memory data, Collection<Preference> when reading fresh
    Object maybePrefs = data.get(userID);
    if (fromPriorData) {
      // Data are PreferenceArray

      PreferenceArray prefs = (PreferenceArray) maybePrefs;
      if (!hasTimestamp && preferenceValueString.isEmpty()) {
        // Then line is of form "userID,itemID,", meaning remove
        if (prefs != null) {
          // First check whether the user actually has a preference for this item
          boolean exists = false;
          int length = prefs.length();
          for (int i = 0; i < length; i++) {
            if (prefs.getItemID(i) == itemID) {
              exists = true;
              break;
            }
          }
          if (exists) {
            if (length == 1) {
              // Removing the user's only preference: drop the user entry entirely
              data.remove(userID);
            } else {
              // Copy every preference except the removed item into a one-smaller array
              PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1);
              for (int i = 0, j = 0; i < length; i++, j++) {
                if (prefs.getItemID(i) == itemID) {
                  j--; // skip the removed item without advancing the destination index
                } else {
                  newPrefs.set(j, prefs.get(i));
                }
              }
              ((FastByIDMap<PreferenceArray>) data).put(userID, newPrefs);
            }
          }
        }

        removeTimestamp(userID, itemID, timestamps);

      } else {

        float preferenceValue = Float.parseFloat(preferenceValueString);

        // If the user already has a preference for this item, just overwrite its value in place
        boolean exists = false;
        if (prefs != null) {
          for (int i = 0; i < prefs.length(); i++) {
            if (prefs.getItemID(i) == itemID) {
              exists = true;
              prefs.setValue(i, preferenceValue);
              break;
            }
          }
        }

        if (!exists) {
          if (prefs == null) {
            prefs = new GenericUserPreferenceArray(1);
          } else {
            // Grow the array by one, copying old entries into slots 1..length
            // and leaving slot 0 free for the new preference
            PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1);
            for (int i = 0, j = 1; i < prefs.length(); i++, j++) {
              newPrefs.set(j, prefs.get(i));
            }
            prefs = newPrefs;
          }
          prefs.setUserID(0, userID);
          prefs.setItemID(0, itemID);
          prefs.setValue(0, preferenceValue);
          ((FastByIDMap<PreferenceArray>) data).put(userID, prefs);
        }
      }

      // No-op when timestampString is null (as it is on removal lines, which carry no timestamp)
      addTimestamp(userID, itemID, timestampString, timestamps);

    } else {
      // Data are Collection<Preference>

      Collection<Preference> prefs = (Collection<Preference>) maybePrefs;

      if (!hasTimestamp && preferenceValueString.isEmpty()) {
        // Then line is of form "userID,itemID,", meaning remove
        if (prefs != null) {
          // remove pref
          Iterator<Preference> prefsIterator = prefs.iterator();
          while (prefsIterator.hasNext()) {
            Preference pref = prefsIterator.next();
            if (pref.getItemID() == itemID) {
              prefsIterator.remove();
              break;
            }
          }
        }

        removeTimestamp(userID, itemID, timestamps);

      } else {

        float preferenceValue = Float.parseFloat(preferenceValueString);

        // Overwrite an existing preference's value when present
        boolean exists = false;
        if (prefs != null) {
          for (Preference pref : prefs) {
            if (pref.getItemID() == itemID) {
              exists = true;
              pref.setValue(preferenceValue);
              break;
            }
          }
        }

        if (!exists) {
          if (prefs == null) {
            prefs = new ArrayList<>(2);
            ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs);
          }
          prefs.add(new GenericPreference(userID, itemID, preferenceValue));
        }

        addTimestamp(userID, itemID, timestampString, timestamps);

      }

    }
  }
+
+ protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator,
+ FastByIDMap<FastIDSet> data,
+ FastByIDMap<FastByIDMap<Long>> timestamps) {
+ log.info("Reading file info...");
+ int count = 0;
+ while (dataOrUpdateFileIterator.hasNext()) {
+ String line = dataOrUpdateFileIterator.next();
+ if (!line.isEmpty()) {
+ processLineWithoutID(line, data, timestamps);
+ if (++count % 100000 == 0) {
+ log.info("Processed {} lines", count);
+ }
+ }
+ }
+ log.info("Read lines: {}", count);
+ }
+
+ protected void processLineWithoutID(String line,
+ FastByIDMap<FastIDSet> data,
+ FastByIDMap<FastByIDMap<Long>> timestamps) {
+
+ if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) {
+ return;
+ }
+
+ Iterator<String> tokens = delimiterPattern.split(line).iterator();
+ String userIDString = tokens.next();
+ String itemIDString = tokens.next();
+ boolean hasPreference = tokens.hasNext();
+ String preferenceValueString = hasPreference ? tokens.next() : "";
+ boolean hasTimestamp = tokens.hasNext();
+ String timestampString = hasTimestamp ? tokens.next() : null;
+
+ long userID = readUserIDFromString(userIDString);
+ long itemID = readItemIDFromString(itemIDString);
+
+ if (transpose) {
+ long tmp = userID;
+ userID = itemID;
+ itemID = tmp;
+ }
+
+ if (hasPreference && !hasTimestamp && preferenceValueString.isEmpty()) {
+ // Then line is of form "userID,itemID,", meaning remove
+
+ FastIDSet itemIDs = data.get(userID);
+ if (itemIDs != null) {
+ itemIDs.remove(itemID);
+ }
+
+ removeTimestamp(userID, itemID, timestamps);
+
+ } else {
+
+ FastIDSet itemIDs = data.get(userID);
+ if (itemIDs == null) {
+ itemIDs = new FastIDSet(2);
+ data.put(userID, itemIDs);
+ }
+ itemIDs.add(itemID);
+
+ addTimestamp(userID, itemID, timestampString, timestamps);
+
+ }
+ }
+
+ private void addTimestamp(long userID,
+ long itemID,
+ String timestampString,
+ FastByIDMap<FastByIDMap<Long>> timestamps) {
+ if (timestampString != null) {
+ FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
+ if (itemTimestamps == null) {
+ itemTimestamps = new FastByIDMap<>();
+ timestamps.put(userID, itemTimestamps);
+ }
+ long timestamp = readTimestampFromString(timestampString);
+ itemTimestamps.put(itemID, timestamp);
+ }
+ }
+
+ private static void removeTimestamp(long userID,
+ long itemID,
+ FastByIDMap<FastByIDMap<Long>> timestamps) {
+ FastByIDMap<Long> itemTimestamps = timestamps.get(userID);
+ if (itemTimestamps != null) {
+ itemTimestamps.remove(itemID);
+ }
+ }
+
  /**
   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
   * translation.
   *
   * @param value user-ID token as read from the input file
   * @return the numeric user ID; by default simply {@link Long#parseLong(String)} of the token
   */
  protected long readUserIDFromString(String value) {
    return Long.parseLong(value);
  }

  /**
   * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by
   * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform
   * translation.
   *
   * @param value item-ID token as read from the input file
   * @return the numeric item ID; by default simply {@link Long#parseLong(String)} of the token
   */
  protected long readItemIDFromString(String value) {
    return Long.parseLong(value);
  }

  /**
   * Subclasses may wish to override this to change how time values in the input file are parsed.
   * By default they are expected to be numeric, expressing a time as milliseconds since the epoch.
   *
   * @param value timestamp token as read from the input file
   * @return the timestamp as a long; by default simply {@link Long#parseLong(String)} of the token
   */
  protected long readTimestampFromString(String value) {
    return Long.parseLong(value);
  }
+
  // ---------------------------------------------------------------------
  // DataModel API: every read operation below is delegated to the
  // in-memory delegate model built from the data file.
  // ---------------------------------------------------------------------

  @Override
  public LongPrimitiveIterator getUserIDs() throws TasteException {
    return delegate.getUserIDs();
  }

  @Override
  public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
    return delegate.getPreferencesFromUser(userID);
  }

  @Override
  public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
    return delegate.getItemIDsFromUser(userID);
  }

  @Override
  public LongPrimitiveIterator getItemIDs() throws TasteException {
    return delegate.getItemIDs();
  }

  @Override
  public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
    return delegate.getPreferencesForItem(itemID);
  }

  @Override
  public Float getPreferenceValue(long userID, long itemID) throws TasteException {
    return delegate.getPreferenceValue(userID, itemID);
  }

  @Override
  public Long getPreferenceTime(long userID, long itemID) throws TasteException {
    return delegate.getPreferenceTime(userID, itemID);
  }

  @Override
  public int getNumItems() throws TasteException {
    return delegate.getNumItems();
  }

  @Override
  public int getNumUsers() throws TasteException {
    return delegate.getNumUsers();
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
    return delegate.getNumUsersWithPreferenceFor(itemID);
  }

  @Override
  public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
    return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
  }

  /**
   * Note that this method only updates the in-memory preference data that this {@link FileDataModel}
   * maintains; it does not modify any data on disk. Therefore any updates from this method are only
   * temporary, and lost when data is reloaded from a file. This method should also be considered relatively
   * slow.
   */
  @Override
  public void setPreference(long userID, long itemID, float value) throws TasteException {
    delegate.setPreference(userID, itemID, value);
  }

  /** See the warning at {@link #setPreference(long, long, float)}. */
  @Override
  public void removePreference(long userID, long itemID) throws TasteException {
    delegate.removePreference(userID, itemID);
  }

  @Override
  public void refresh(Collection<Refreshable> alreadyRefreshed) {
    // Reload only when the main data file or the newest update file is at least
    // minReloadIntervalMS newer than what was last loaded, to avoid reload storms.
    if (dataFile.lastModified() > lastModified + minReloadIntervalMS
        || readLastUpdateFileModified() > lastUpdateFileModified + minReloadIntervalMS) {
      log.debug("File has changed; reloading...");
      reload();
    }
  }

  @Override
  public boolean hasPreferenceValues() {
    return delegate.hasPreferenceValues();
  }

  @Override
  public float getMaxPreference() {
    return delegate.getMaxPreference();
  }

  @Override
  public float getMinPreference() {
    return delegate.getMinPreference();
  }

  @Override
  public String toString() {
    return "FileDataModel[dataFile:" + dataFile + ']';
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
new file mode 100644
index 0000000..1bcb4ef
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.model.file;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.concurrent.locks.ReentrantLock;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.model.AbstractIDMigrator;
+import org.apache.mahout.common.iterator.FileLineIterable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An {@link org.apache.mahout.cf.taste.model.IDMigrator} backed by a file.
+ * This class typically expects a file where each line
+ * contains a single stringID to be stored in this migrator.
+ * </p>
+ *
+ * <p>
+ * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file
+ * has been reloaded very recently already.
+ * </p>
+ */
+public class FileIDMigrator extends AbstractIDMigrator {
+
+ public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+
+ private final File dataFile;
+ private FastByIDMap<String> longToString;
+ private final ReentrantLock reloadLock;
+
+ private long lastModified;
+ private final long minReloadIntervalMS;
+
+ private static final Logger log = LoggerFactory.getLogger(FileIDMigrator.class);
+
+ public FileIDMigrator(File dataFile) throws FileNotFoundException {
+ this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS);
+ }
+
+ public FileIDMigrator(File dataFile, long minReloadIntervalMS) throws FileNotFoundException {
+ longToString = new FastByIDMap<>(100);
+ this.dataFile = Preconditions.checkNotNull(dataFile);
+ if (!dataFile.exists() || dataFile.isDirectory()) {
+ throw new FileNotFoundException(dataFile.toString());
+ }
+
+ log.info("Creating FileReadonlyIDMigrator for file {}", dataFile);
+
+ this.reloadLock = new ReentrantLock();
+ this.lastModified = dataFile.lastModified();
+ this.minReloadIntervalMS = minReloadIntervalMS;
+
+ reload();
+ }
+
+ @Override
+ public String toStringID(long longID) {
+ return longToString.get(longID);
+ }
+
+ private void reload() {
+ if (reloadLock.tryLock()) {
+ try {
+ longToString = buildMapping();
+ } catch (IOException ioe) {
+ throw new IllegalStateException(ioe);
+ } finally {
+ reloadLock.unlock();
+ }
+ }
+ }
+
+ private FastByIDMap<String> buildMapping() throws IOException {
+ FastByIDMap<String> mapping = new FastByIDMap<>();
+ for (String line : new FileLineIterable(dataFile)) {
+ mapping.put(toLongID(line), line);
+ }
+ lastModified = dataFile.lastModified();
+ return mapping;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ if (dataFile.lastModified() > lastModified + minReloadIntervalMS) {
+ log.debug("File has changed; reloading...");
+ reload();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "FileIDMigrator[dataFile:" + dataFile + ']';
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
new file mode 100644
index 0000000..8d33f60
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Contains methods and resources useful to all classes in this package.
+ * </p>
+ */
+abstract class AbstractUserNeighborhood implements UserNeighborhood {
+
+ private final UserSimilarity userSimilarity;
+ private final DataModel dataModel;
+ private final double samplingRate;
+ private final RefreshHelper refreshHelper;
+
+ AbstractUserNeighborhood(UserSimilarity userSimilarity, DataModel dataModel, double samplingRate) {
+ Preconditions.checkArgument(userSimilarity != null, "userSimilarity is null");
+ Preconditions.checkArgument(dataModel != null, "dataModel is null");
+ Preconditions.checkArgument(samplingRate > 0.0 && samplingRate <= 1.0, "samplingRate must be in (0,1]");
+ this.userSimilarity = userSimilarity;
+ this.dataModel = dataModel;
+ this.samplingRate = samplingRate;
+ this.refreshHelper = new RefreshHelper(null);
+ this.refreshHelper.addDependency(this.dataModel);
+ this.refreshHelper.addDependency(this.userSimilarity);
+ }
+
+ final UserSimilarity getUserSimilarity() {
+ return userSimilarity;
+ }
+
+ final DataModel getDataModel() {
+ return dataModel;
+ }
+
+ final double getSamplingRate() {
+ return samplingRate;
+ }
+
+ @Override
+ public final void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
new file mode 100644
index 0000000..998e476
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+
+import com.google.common.base.Preconditions;
+
+/** A caching wrapper around an underlying {@link UserNeighborhood} implementation. */
+public final class CachingUserNeighborhood implements UserNeighborhood {
+
+ private final UserNeighborhood neighborhood;
+ private final Cache<Long,long[]> neighborhoodCache;
+
+ public CachingUserNeighborhood(UserNeighborhood neighborhood, DataModel dataModel) throws TasteException {
+ Preconditions.checkArgument(neighborhood != null, "neighborhood is null");
+ this.neighborhood = neighborhood;
+ int maxCacheSize = dataModel.getNumUsers(); // just a dumb heuristic for sizing
+ this.neighborhoodCache = new Cache<>(new NeighborhoodRetriever(neighborhood), maxCacheSize);
+ }
+
+ @Override
+ public long[] getUserNeighborhood(long userID) throws TasteException {
+ return neighborhoodCache.get(userID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ neighborhoodCache.clear();
+ Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(refreshed, neighborhood);
+ }
+
+ private static final class NeighborhoodRetriever implements Retriever<Long,long[]> {
+ private final UserNeighborhood neighborhood;
+
+ private NeighborhoodRetriever(UserNeighborhood neighborhood) {
+ this.neighborhood = neighborhood;
+ }
+
+ @Override
+ public long[] get(Long key) throws TasteException {
+ return neighborhood.getUserNeighborhood(key);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
new file mode 100644
index 0000000..7f3a98a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Computes a neighborhood consisting of the nearest n users to a given user. "Nearest" is defined by the
+ * given {@link UserSimilarity}.
+ * </p>
+ */
+public final class NearestNUserNeighborhood extends AbstractUserNeighborhood {
+
+ private final int n;
+ private final double minSimilarity;
+
+ /**
+ * @param n neighborhood size; capped at the number of users in the data model
+ * @throws IllegalArgumentException
+ * if {@code n < 1}, or userSimilarity or dataModel are {@code null}
+ */
+ public NearestNUserNeighborhood(int n, UserSimilarity userSimilarity, DataModel dataModel) throws TasteException {
+ this(n, Double.NEGATIVE_INFINITY, userSimilarity, dataModel, 1.0);
+ }
+
+ /**
+ * @param n neighborhood size; capped at the number of users in the data model
+ * @param minSimilarity minimal similarity required for neighbors
+ * @throws IllegalArgumentException
+ * if {@code n < 1}, or userSimilarity or dataModel are {@code null}
+ */
+ public NearestNUserNeighborhood(int n,
+ double minSimilarity,
+ UserSimilarity userSimilarity,
+ DataModel dataModel) throws TasteException {
+ this(n, minSimilarity, userSimilarity, dataModel, 1.0);
+ }
+
+ /**
+ * @param n neighborhood size; capped at the number of users in the data model
+ * @param minSimilarity minimal similarity required for neighbors
+ * @param samplingRate percentage of users to consider when building neighborhood -- decrease to trade quality for
+ * performance
+ * @throws IllegalArgumentException
+ * if {@code n < 1} or samplingRate is NaN or not in (0,1], or userSimilarity or dataModel are
+ * {@code null}
+ */
+ public NearestNUserNeighborhood(int n,
+ double minSimilarity,
+ UserSimilarity userSimilarity,
+ DataModel dataModel,
+ double samplingRate) throws TasteException {
+ super(userSimilarity, dataModel, samplingRate);
+ Preconditions.checkArgument(n >= 1, "n must be at least 1");
+ int numUsers = dataModel.getNumUsers();
+ this.n = n > numUsers ? numUsers : n;
+ this.minSimilarity = minSimilarity;
+ }
+
+ @Override
+ public long[] getUserNeighborhood(long userID) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ UserSimilarity userSimilarityImpl = getUserSimilarity();
+
+ TopItems.Estimator<Long> estimator = new Estimator(userSimilarityImpl, userID, minSimilarity);
+
+ LongPrimitiveIterator userIDs = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(),
+ getSamplingRate());
+
+ return TopItems.getTopUsers(n, userIDs, null, estimator);
+ }
+
+ @Override
+ public String toString() {
+ return "NearestNUserNeighborhood";
+ }
+
+ private static final class Estimator implements TopItems.Estimator<Long> {
+ private final UserSimilarity userSimilarityImpl;
+ private final long theUserID;
+ private final double minSim;
+
+ private Estimator(UserSimilarity userSimilarityImpl, long theUserID, double minSim) {
+ this.userSimilarityImpl = userSimilarityImpl;
+ this.theUserID = theUserID;
+ this.minSim = minSim;
+ }
+
+ @Override
+ public double estimate(Long userID) throws TasteException {
+ if (userID == theUserID) {
+ return Double.NaN;
+ }
+ double sim = userSimilarityImpl.userSimilarity(theUserID, userID);
+ return sim >= minSim ? sim : Double.NaN;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
new file mode 100644
index 0000000..d5246e4
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.neighborhood;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Computes a neigbhorhood consisting of all users whose similarity to the given user meets or exceeds a
+ * certain threshold. Similarity is defined by the given {@link UserSimilarity}.
+ * </p>
+ */
+public final class ThresholdUserNeighborhood extends AbstractUserNeighborhood {
+
+ private final double threshold;
+
+ /**
+ * @param threshold
+ * similarity threshold
+ * @param userSimilarity
+ * similarity metric
+ * @param dataModel
+ * data model
+ * @throws IllegalArgumentException
+ * if threshold is {@link Double#NaN}, or if samplingRate is not positive and less than or equal
+ * to 1.0, or if userSimilarity or dataModel are {@code null}
+ */
+ public ThresholdUserNeighborhood(double threshold, UserSimilarity userSimilarity, DataModel dataModel) {
+ this(threshold, userSimilarity, dataModel, 1.0);
+ }
+
+ /**
+ * @param threshold
+ * similarity threshold
+ * @param userSimilarity
+ * similarity metric
+ * @param dataModel
+ * data model
+ * @param samplingRate
+ * percentage of users to consider when building neighborhood -- decrease to trade quality for
+ * performance
+ * @throws IllegalArgumentException
+ * if threshold or samplingRate is {@link Double#NaN}, or if samplingRate is not positive and less
+ * than or equal to 1.0, or if userSimilarity or dataModel are {@code null}
+ */
+ public ThresholdUserNeighborhood(double threshold,
+ UserSimilarity userSimilarity,
+ DataModel dataModel,
+ double samplingRate) {
+ super(userSimilarity, dataModel, samplingRate);
+ Preconditions.checkArgument(!Double.isNaN(threshold), "threshold must not be NaN");
+ this.threshold = threshold;
+ }
+
+ @Override
+ public long[] getUserNeighborhood(long userID) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ FastIDSet neighborhood = new FastIDSet();
+ LongPrimitiveIterator usersIterable = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel
+ .getUserIDs(), getSamplingRate());
+ UserSimilarity userSimilarityImpl = getUserSimilarity();
+
+ while (usersIterable.hasNext()) {
+ long otherUserID = usersIterable.next();
+ if (userID != otherUserID) {
+ double theSimilarity = userSimilarityImpl.userSimilarity(userID, otherUserID);
+ if (!Double.isNaN(theSimilarity) && theSimilarity >= threshold) {
+ neighborhood.add(otherUserID);
+ }
+ }
+ }
+
+ return neighborhood.toArray();
+ }
+
+ @Override
+ public String toString() {
+ return "ThresholdUserNeighborhood";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
new file mode 100644
index 0000000..d24ea6a
--- /dev/null
+++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy;
+import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy;
+
+import java.util.Collection;
+
+/**
+ * Abstract base implementation for retrieving candidate items to recommend
+ */
+public abstract class AbstractCandidateItemsStrategy implements CandidateItemsStrategy,
+ MostSimilarItemsCandidateItemsStrategy {
+
+ protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException{
+ return doGetCandidateItems(preferredItemIDs, dataModel, false);
+ }
+
+ @Override
+ public FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel,
+ boolean includeKnownItems) throws TasteException {
+ return doGetCandidateItems(preferencesFromUser.getIDs(), dataModel, includeKnownItems);
+ }
+
+ @Override
+ public FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel)
+ throws TasteException {
+ return doGetCandidateItems(itemIDs, dataModel, false);
+ }
+
+ protected abstract FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel,
+ boolean includeKnownItems) throws TasteException;
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {}
+}
r***@apache.org
2018-06-28 14:55:14 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
deleted file mode 100644
index a99d54c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/ParallelArraysSGDFactorizer.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.common.RandomUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Collection;
-import java.util.Random;
-
-/**
- * {@link Factorizer} based on Simon Funk's famous article <a href="http://sifter.org/~simon/journal/20061211.html">
- * "Netflix Update: Try this at home"</a>.
- *
- * Attempts to be as memory efficient as possible, only iterating once through the
- * {@link FactorizablePreferences} or {@link DataModel} while copying everything to primitive arrays.
- * Learning works in place on these datastructures after that.
- */
-public class ParallelArraysSGDFactorizer implements Factorizer {
-
- public static final double DEFAULT_LEARNING_RATE = 0.005;
- public static final double DEFAULT_PREVENT_OVERFITTING = 0.02;
- public static final double DEFAULT_RANDOM_NOISE = 0.005;
-
- private final int numFeatures;
- private final int numIterations;
- private final float minPreference;
- private final float maxPreference;
-
- private final Random random;
- private final double learningRate;
- private final double preventOverfitting;
-
- private final FastByIDMap<Integer> userIDMapping;
- private final FastByIDMap<Integer> itemIDMapping;
-
- private final double[][] userFeatures;
- private final double[][] itemFeatures;
-
- private final int[] userIndexes;
- private final int[] itemIndexes;
- private final float[] values;
-
- private final double defaultValue;
- private final double interval;
- private final double[] cachedEstimates;
-
-
- private static final Logger log = LoggerFactory.getLogger(ParallelArraysSGDFactorizer.class);
-
- public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations) {
- this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, DEFAULT_LEARNING_RATE,
- DEFAULT_PREVENT_OVERFITTING, DEFAULT_RANDOM_NOISE);
- }
-
- public ParallelArraysSGDFactorizer(DataModel dataModel, int numFeatures, int numIterations, double learningRate,
- double preventOverfitting, double randomNoise) {
- this(new DataModelFactorizablePreferences(dataModel), numFeatures, numIterations, learningRate, preventOverfitting,
- randomNoise);
- }
-
- public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePrefs, int numFeatures, int numIterations) {
- this(factorizablePrefs, numFeatures, numIterations, DEFAULT_LEARNING_RATE, DEFAULT_PREVENT_OVERFITTING,
- DEFAULT_RANDOM_NOISE);
- }
-
- public ParallelArraysSGDFactorizer(FactorizablePreferences factorizablePreferences, int numFeatures,
- int numIterations, double learningRate, double preventOverfitting, double randomNoise) {
-
- this.numFeatures = numFeatures;
- this.numIterations = numIterations;
- minPreference = factorizablePreferences.getMinPreference();
- maxPreference = factorizablePreferences.getMaxPreference();
-
- this.random = RandomUtils.getRandom();
- this.learningRate = learningRate;
- this.preventOverfitting = preventOverfitting;
-
- int numUsers = factorizablePreferences.numUsers();
- int numItems = factorizablePreferences.numItems();
- int numPrefs = factorizablePreferences.numPreferences();
-
- log.info("Mapping {} users...", numUsers);
- userIDMapping = new FastByIDMap<>(numUsers);
- int index = 0;
- LongPrimitiveIterator userIterator = factorizablePreferences.getUserIDs();
- while (userIterator.hasNext()) {
- userIDMapping.put(userIterator.nextLong(), index++);
- }
-
- log.info("Mapping {} items", numItems);
- itemIDMapping = new FastByIDMap<>(numItems);
- index = 0;
- LongPrimitiveIterator itemIterator = factorizablePreferences.getItemIDs();
- while (itemIterator.hasNext()) {
- itemIDMapping.put(itemIterator.nextLong(), index++);
- }
-
- this.userIndexes = new int[numPrefs];
- this.itemIndexes = new int[numPrefs];
- this.values = new float[numPrefs];
- this.cachedEstimates = new double[numPrefs];
-
- index = 0;
- log.info("Loading {} preferences into memory", numPrefs);
- RunningAverage average = new FullRunningAverage();
- for (Preference preference : factorizablePreferences.getPreferences()) {
- userIndexes[index] = userIDMapping.get(preference.getUserID());
- itemIndexes[index] = itemIDMapping.get(preference.getItemID());
- values[index] = preference.getValue();
- cachedEstimates[index] = 0;
-
- average.addDatum(preference.getValue());
-
- index++;
- if (index % 1000000 == 0) {
- log.info("Processed {} preferences", index);
- }
- }
- log.info("Processed {} preferences, done.", index);
-
- double averagePreference = average.getAverage();
- log.info("Average preference value is {}", averagePreference);
-
- double prefInterval = factorizablePreferences.getMaxPreference() - factorizablePreferences.getMinPreference();
- defaultValue = Math.sqrt((averagePreference - prefInterval * 0.1) / numFeatures);
- interval = prefInterval * 0.1 / numFeatures;
-
- userFeatures = new double[numUsers][numFeatures];
- itemFeatures = new double[numItems][numFeatures];
-
- log.info("Initializing feature vectors...");
- for (int feature = 0; feature < numFeatures; feature++) {
- for (int userIndex = 0; userIndex < numUsers; userIndex++) {
- userFeatures[userIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
- }
- for (int itemIndex = 0; itemIndex < numItems; itemIndex++) {
- itemFeatures[itemIndex][feature] = defaultValue + (random.nextDouble() - 0.5) * interval * randomNoise;
- }
- }
- }
-
- @Override
- public Factorization factorize() throws TasteException {
- for (int feature = 0; feature < numFeatures; feature++) {
- log.info("Shuffling preferences...");
- shufflePreferences();
- log.info("Starting training of feature {} ...", feature);
- for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
- if (currentIteration == numIterations - 1) {
- double rmse = trainingIterationWithRmse(feature);
- log.info("Finished training feature {} with RMSE {}", feature, rmse);
- } else {
- trainingIteration(feature);
- }
- }
- if (feature < numFeatures - 1) {
- log.info("Updating cache...");
- for (int index = 0; index < userIndexes.length; index++) {
- cachedEstimates[index] = estimate(userIndexes[index], itemIndexes[index], feature, cachedEstimates[index],
- false);
- }
- }
- }
- log.info("Factorization done");
- return new Factorization(userIDMapping, itemIDMapping, userFeatures, itemFeatures);
- }
-
- private void trainingIteration(int feature) {
- for (int index = 0; index < userIndexes.length; index++) {
- train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
- }
- }
-
- private double trainingIterationWithRmse(int feature) {
- double rmse = 0.0;
- for (int index = 0; index < userIndexes.length; index++) {
- double error = train(userIndexes[index], itemIndexes[index], feature, values[index], cachedEstimates[index]);
- rmse += error * error;
- }
- return Math.sqrt(rmse / userIndexes.length);
- }
-
- private double estimate(int userIndex, int itemIndex, int feature, double cachedEstimate, boolean trailing) {
- double sum = cachedEstimate;
- sum += userFeatures[userIndex][feature] * itemFeatures[itemIndex][feature];
- if (trailing) {
- sum += (numFeatures - feature - 1) * (defaultValue + interval) * (defaultValue + interval);
- if (sum > maxPreference) {
- sum = maxPreference;
- } else if (sum < minPreference) {
- sum = minPreference;
- }
- }
- return sum;
- }
-
- public double train(int userIndex, int itemIndex, int feature, double original, double cachedEstimate) {
- double error = original - estimate(userIndex, itemIndex, feature, cachedEstimate, true);
- double[] userVector = userFeatures[userIndex];
- double[] itemVector = itemFeatures[itemIndex];
-
- userVector[feature] += learningRate * (error * itemVector[feature] - preventOverfitting * userVector[feature]);
- itemVector[feature] += learningRate * (error * userVector[feature] - preventOverfitting * itemVector[feature]);
-
- return error;
- }
-
- protected void shufflePreferences() {
- /* Durstenfeld shuffle */
- for (int currentPos = userIndexes.length - 1; currentPos > 0; currentPos--) {
- int swapPos = random.nextInt(currentPos + 1);
- swapPreferences(currentPos, swapPos);
- }
- }
-
- private void swapPreferences(int posA, int posB) {
- int tmpUserIndex = userIndexes[posA];
- int tmpItemIndex = itemIndexes[posA];
- float tmpValue = values[posA];
- double tmpEstimate = cachedEstimates[posA];
-
- userIndexes[posA] = userIndexes[posB];
- itemIndexes[posA] = itemIndexes[posB];
- values[posA] = values[posB];
- cachedEstimates[posA] = cachedEstimates[posB];
-
- userIndexes[posB] = tmpUserIndex;
- itemIndexes[posB] = tmpItemIndex;
- values[posB] = tmpValue;
- cachedEstimates[posB] = tmpEstimate;
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- // do nothing
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
deleted file mode 100644
index 5cce02d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/Track1SVDRunner.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.NoSuchUserException;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.example.kddcup.track1.EstimateConverter;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorization;
-import org.apache.mahout.cf.taste.impl.recommender.svd.Factorizer;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * run an SVD factorization of the KDD track1 data.
- *
- * needs at least 6-7GB of memory, tested with -Xms6700M -Xmx6700M
- *
- */
-public final class Track1SVDRunner {
-
- private static final Logger log = LoggerFactory.getLogger(Track1SVDRunner.class);
-
- private Track1SVDRunner() {
- }
-
- public static void main(String[] args) throws Exception {
-
- if (args.length != 2) {
- System.err.println("Necessary arguments: <kddDataFileDirectory> <resultFile>");
- return;
- }
-
- File dataFileDirectory = new File(args[0]);
- if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
- throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
- }
-
- File resultFile = new File(args[1]);
-
- /* the knobs to turn */
- int numFeatures = 20;
- int numIterations = 5;
- double learningRate = 0.0001;
- double preventOverfitting = 0.002;
- double randomNoise = 0.0001;
-
-
- KDDCupFactorizablePreferences factorizablePreferences =
- new KDDCupFactorizablePreferences(KDDCupDataModel.getTrainingFile(dataFileDirectory));
-
- Factorizer sgdFactorizer = new ParallelArraysSGDFactorizer(factorizablePreferences, numFeatures, numIterations,
- learningRate, preventOverfitting, randomNoise);
-
- Factorization factorization = sgdFactorizer.factorize();
-
- log.info("Estimating validation preferences...");
- int prefsProcessed = 0;
- RunningAverage average = new FullRunningAverage();
- for (Pair<PreferenceArray,long[]> validationPair
- : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
- for (Preference validationPref : validationPair.getFirst()) {
- double estimate = estimatePreference(factorization, validationPref.getUserID(), validationPref.getItemID(),
- factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
- double error = validationPref.getValue() - estimate;
- average.addDatum(error * error);
- prefsProcessed++;
- if (prefsProcessed % 100000 == 0) {
- log.info("Computed {} estimations", prefsProcessed);
- }
- }
- }
- log.info("Computed {} estimations, done.", prefsProcessed);
-
- double rmse = Math.sqrt(average.getAverage());
- log.info("RMSE {}", rmse);
-
- log.info("Estimating test preferences...");
- OutputStream out = null;
- try {
- out = new BufferedOutputStream(new FileOutputStream(resultFile));
-
- for (Pair<PreferenceArray,long[]> testPair
- : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
- for (Preference testPref : testPair.getFirst()) {
- double estimate = estimatePreference(factorization, testPref.getUserID(), testPref.getItemID(),
- factorizablePreferences.getMinPreference(), factorizablePreferences.getMaxPreference());
- byte result = EstimateConverter.convert(estimate, testPref.getUserID(), testPref.getItemID());
- out.write(result);
- }
- }
- } finally {
- Closeables.close(out, false);
- }
- log.info("wrote estimates to {}, done.", resultFile.getAbsolutePath());
- }
-
- static double estimatePreference(Factorization factorization, long userID, long itemID, float minPreference,
- float maxPreference) throws NoSuchUserException, NoSuchItemException {
- double[] userFeatures = factorization.getUserFeatures(userID);
- double[] itemFeatures = factorization.getItemFeatures(itemID);
- double estimate = 0;
- for (int feature = 0; feature < userFeatures.length; feature++) {
- estimate += userFeatures[feature] * itemFeatures[feature];
- }
- if (estimate < minPreference) {
- estimate = minPreference;
- } else if (estimate > maxPreference) {
- estimate = maxPreference;
- }
- return estimate;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
deleted file mode 100644
index ce025a9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/HybridSimilarity.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.similarity.AbstractItemSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-final class HybridSimilarity extends AbstractItemSimilarity {
-
- private final ItemSimilarity cfSimilarity;
- private final ItemSimilarity contentSimilarity;
-
- HybridSimilarity(DataModel dataModel, File dataFileDirectory) throws IOException {
- super(dataModel);
- cfSimilarity = new LogLikelihoodSimilarity(dataModel);
- contentSimilarity = new TrackItemSimilarity(dataFileDirectory);
- }
-
- @Override
- public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
- return contentSimilarity.itemSimilarity(itemID1, itemID2) * cfSimilarity.itemSimilarity(itemID1, itemID2);
- }
-
- @Override
- public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
- double[] result = contentSimilarity.itemSimilarities(itemID1, itemID2s);
- double[] multipliers = cfSimilarity.itemSimilarities(itemID1, itemID2s);
- for (int i = 0; i < result.length; i++) {
- result[i] *= multipliers[i];
- }
- return result;
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- cfSimilarity.refresh(alreadyRefreshed);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
deleted file mode 100644
index 50fd35e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Callable.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.TreeMap;
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-final class Track2Callable implements Callable<UserResult> {
-
- private static final Logger log = LoggerFactory.getLogger(Track2Callable.class);
- private static final AtomicInteger COUNT = new AtomicInteger();
-
- private final Recommender recommender;
- private final PreferenceArray userTest;
-
- Track2Callable(Recommender recommender, PreferenceArray userTest) {
- this.recommender = recommender;
- this.userTest = userTest;
- }
-
- @Override
- public UserResult call() throws TasteException {
-
- int testSize = userTest.length();
- if (testSize != 6) {
- throw new IllegalArgumentException("Expecting 6 items for user but got " + userTest);
- }
- long userID = userTest.get(0).getUserID();
- TreeMap<Double,Long> estimateToItemID = new TreeMap<>(Collections.reverseOrder());
-
- for (int i = 0; i < testSize; i++) {
- long itemID = userTest.getItemID(i);
- double estimate;
- try {
- estimate = recommender.estimatePreference(userID, itemID);
- } catch (NoSuchItemException nsie) {
- // OK in the sample data provided before the contest, should never happen otherwise
- log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
- continue;
- }
-
- if (!Double.isNaN(estimate)) {
- estimateToItemID.put(estimate, itemID);
- }
- }
-
- Collection<Long> itemIDs = estimateToItemID.values();
- List<Long> topThree = new ArrayList<>(itemIDs);
- if (topThree.size() > 3) {
- topThree = topThree.subList(0, 3);
- } else if (topThree.size() < 3) {
- log.warn("Unable to recommend three items for {}", userID);
- // Some NaNs - just guess at the rest then
- Collection<Long> newItemIDs = new HashSet<>(3);
- newItemIDs.addAll(itemIDs);
- int i = 0;
- while (i < testSize && newItemIDs.size() < 3) {
- newItemIDs.add(userTest.getItemID(i));
- i++;
- }
- topThree = new ArrayList<>(newItemIDs);
- }
- if (topThree.size() != 3) {
- throw new IllegalStateException();
- }
-
- boolean[] result = new boolean[testSize];
- for (int i = 0; i < testSize; i++) {
- result[i] = topThree.contains(userTest.getItemID(i));
- }
-
- if (COUNT.incrementAndGet() % 1000 == 0) {
- log.info("Completed {} users", COUNT.get());
- }
-
- return new UserResult(userID, result);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
deleted file mode 100644
index 185a00d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Recommender.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefItemBasedRecommender;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-public final class Track2Recommender implements Recommender {
-
- private final Recommender recommender;
-
- public Track2Recommender(DataModel dataModel, File dataFileDirectory) throws TasteException {
- // Change this to whatever you like!
- ItemSimilarity similarity;
- try {
- similarity = new HybridSimilarity(dataModel, dataFileDirectory);
- } catch (IOException ioe) {
- throw new TasteException(ioe);
- }
- recommender = new GenericBooleanPrefItemBasedRecommender(dataModel, similarity);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
- return recommender.recommend(userID, howMany);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
- return recommend(userID, howMany, null, includeKnownItems);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, false);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
- throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
- }
-
- @Override
- public float estimatePreference(long userID, long itemID) throws TasteException {
- return recommender.estimatePreference(userID, itemID);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- recommender.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- recommender.removePreference(userID, itemID);
- }
-
- @Override
- public DataModel getDataModel() {
- return recommender.getDataModel();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- recommender.refresh(alreadyRefreshed);
- }
-
- @Override
- public String toString() {
- return "Track1Recommender[recommender:" + recommender + ']';
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
deleted file mode 100644
index 09ade5d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2RecommenderBuilder.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class Track2RecommenderBuilder implements RecommenderBuilder {
-
- @Override
- public Recommender buildRecommender(DataModel dataModel) throws TasteException {
- return new Track2Recommender(dataModel, ((KDDCupDataModel) dataModel).getDataFileDirectory());
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
deleted file mode 100644
index 3cbb61c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/Track2Runner.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * <p>Runs "track 2" of the KDD Cup competition using whatever recommender is inside {@link Track2Recommender}
- * and attempts to output the result in the correct contest format.</p>
- *
- * <p>Run as: {@code Track2Runner [track 2 data file directory] [output file]}</p>
- */
-public final class Track2Runner {
-
- private static final Logger log = LoggerFactory.getLogger(Track2Runner.class);
-
- private Track2Runner() {
- }
-
- public static void main(String[] args) throws Exception {
-
- File dataFileDirectory = new File(args[0]);
- if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
- throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
- }
-
- long start = System.currentTimeMillis();
-
- KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
- Track2Recommender recommender = new Track2Recommender(model, dataFileDirectory);
-
- long end = System.currentTimeMillis();
- log.info("Loaded model in {}s", (end - start) / 1000);
- start = end;
-
- Collection<Track2Callable> callables = new ArrayList<>();
- for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
- PreferenceArray userTest = tests.getFirst();
- callables.add(new Track2Callable(recommender, userTest));
- }
-
- int cores = Runtime.getRuntime().availableProcessors();
- log.info("Running on {} cores", cores);
- ExecutorService executor = Executors.newFixedThreadPool(cores);
- List<Future<UserResult>> futures = executor.invokeAll(callables);
- executor.shutdown();
-
- end = System.currentTimeMillis();
- log.info("Ran recommendations in {}s", (end - start) / 1000);
- start = end;
-
- try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
- long lastUserID = Long.MIN_VALUE;
- for (Future<UserResult> future : futures) {
- UserResult result = future.get();
- long userID = result.getUserID();
- if (userID <= lastUserID) {
- throw new IllegalStateException();
- }
- lastUserID = userID;
- out.write(result.getResultBytes());
- }
- }
-
- end = System.currentTimeMillis();
- log.info("Wrote output in {}s", (end - start) / 1000);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
deleted file mode 100644
index abd15f8..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackData.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.util.regex.Pattern;
-
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-
-final class TrackData {
-
- private static final Pattern PIPE = Pattern.compile("\\|");
- private static final String NO_VALUE = "None";
- static final long NO_VALUE_ID = Long.MIN_VALUE;
- private static final FastIDSet NO_GENRES = new FastIDSet();
-
- private final long trackID;
- private final long albumID;
- private final long artistID;
- private final FastIDSet genreIDs;
-
- TrackData(CharSequence line) {
- String[] tokens = PIPE.split(line);
- trackID = Long.parseLong(tokens[0]);
- albumID = parse(tokens[1]);
- artistID = parse(tokens[2]);
- if (tokens.length > 3) {
- genreIDs = new FastIDSet(tokens.length - 3);
- for (int i = 3; i < tokens.length; i++) {
- genreIDs.add(Long.parseLong(tokens[i]));
- }
- } else {
- genreIDs = NO_GENRES;
- }
- }
-
- private static long parse(String value) {
- return NO_VALUE.equals(value) ? NO_VALUE_ID : Long.parseLong(value);
- }
-
- public long getTrackID() {
- return trackID;
- }
-
- public long getAlbumID() {
- return albumID;
- }
-
- public long getArtistID() {
- return artistID;
- }
-
- public FastIDSet getGenreIDs() {
- return genreIDs;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
deleted file mode 100644
index 3012a84..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/TrackItemSimilarity.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-final class TrackItemSimilarity implements ItemSimilarity {
-
- private final FastByIDMap<TrackData> trackData;
-
- TrackItemSimilarity(File dataFileDirectory) throws IOException {
- trackData = new FastByIDMap<>();
- for (String line : new FileLineIterable(KDDCupDataModel.getTrackFile(dataFileDirectory))) {
- TrackData trackDatum = new TrackData(line);
- trackData.put(trackDatum.getTrackID(), trackDatum);
- }
- }
-
- @Override
- public double itemSimilarity(long itemID1, long itemID2) {
- if (itemID1 == itemID2) {
- return 1.0;
- }
- TrackData data1 = trackData.get(itemID1);
- TrackData data2 = trackData.get(itemID2);
- if (data1 == null || data2 == null) {
- return 0.0;
- }
-
- // Arbitrarily decide that same album means "very similar"
- if (data1.getAlbumID() != TrackData.NO_VALUE_ID && data1.getAlbumID() == data2.getAlbumID()) {
- return 0.9;
- }
- // ... and same artist means "fairly similar"
- if (data1.getArtistID() != TrackData.NO_VALUE_ID && data1.getArtistID() == data2.getArtistID()) {
- return 0.7;
- }
-
- // Tanimoto coefficient similarity based on genre, but maximum value of 0.25
- FastIDSet genres1 = data1.getGenreIDs();
- FastIDSet genres2 = data2.getGenreIDs();
- if (genres1 == null || genres2 == null) {
- return 0.0;
- }
- int intersectionSize = genres1.intersectionSize(genres2);
- if (intersectionSize == 0) {
- return 0.0;
- }
- int unionSize = genres1.size() + genres2.size() - intersectionSize;
- return intersectionSize / (4.0 * unionSize);
- }
-
- @Override
- public double[] itemSimilarities(long itemID1, long[] itemID2s) {
- int length = itemID2s.length;
- double[] result = new double[length];
- for (int i = 0; i < length; i++) {
- result[i] = itemSimilarity(itemID1, itemID2s[i]);
- }
- return result;
- }
-
- @Override
- public long[] allSimilarItemIDs(long itemID) {
- FastIDSet allSimilarItemIDs = new FastIDSet();
- LongPrimitiveIterator allItemIDs = trackData.keySetIterator();
- while (allItemIDs.hasNext()) {
- long possiblySimilarItemID = allItemIDs.nextLong();
- if (!Double.isNaN(itemSimilarity(itemID, possiblySimilarItemID))) {
- allSimilarItemIDs.add(possiblySimilarItemID);
- }
- }
- return allSimilarItemIDs.toArray();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- // do nothing
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
deleted file mode 100644
index e554d10..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track2/UserResult.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track2;
-
-final class UserResult {
-
- private final long userID;
- private final byte[] resultBytes;
-
- UserResult(long userID, boolean[] result) {
-
- this.userID = userID;
-
- int trueCount = 0;
- for (boolean b : result) {
- if (b) {
- trueCount++;
- }
- }
- if (trueCount != 3) {
- throw new IllegalStateException();
- }
-
- resultBytes = new byte[result.length];
- for (int i = 0; i < result.length; i++) {
- resultBytes[i] = (byte) (result[i] ? '1' : '0');
- }
- }
-
- public long getUserID() {
- return userID;
- }
-
- public byte[] getResultBytes() {
- return resultBytes;
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
deleted file mode 100644
index 22f122e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/hadoop/example/als/netflix/NetflixDatasetConverter.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.hadoop.example.als.netflix;
-
-import com.google.common.base.Preconditions;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.common.iterator.FileLineIterable;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** converts the raw files provided by netflix to an appropriate input format */
-public final class NetflixDatasetConverter {
-
- private static final Logger log = LoggerFactory.getLogger(NetflixDatasetConverter.class);
-
- private static final Pattern SEPARATOR = Pattern.compile(",");
- private static final String MOVIE_DENOTER = ":";
- private static final String TAB = "\t";
- private static final String NEWLINE = "\n";
-
- private NetflixDatasetConverter() {
- }
-
- public static void main(String[] args) throws IOException {
-
- if (args.length != 4) {
- System.err.println("Usage: NetflixDatasetConverter /path/to/training_set/ /path/to/qualifying.txt "
- + "/path/to/judging.txt /path/to/destination");
- return;
- }
-
- String trainingDataDir = args[0];
- String qualifyingTxt = args[1];
- String judgingTxt = args[2];
- Path outputPath = new Path(args[3]);
-
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
-
- Preconditions.checkArgument(trainingDataDir != null, "Training Data location needs to be specified");
- log.info("Creating training set at {}/trainingSet/ratings.tsv ...", outputPath);
- try (BufferedWriter writer =
- new BufferedWriter(
- new OutputStreamWriter(
- fs.create(new Path(outputPath, "trainingSet/ratings.tsv")), Charsets.UTF_8))){
-
- int ratingsProcessed = 0;
- for (File movieRatings : new File(trainingDataDir).listFiles()) {
- try (FileLineIterator lines = new FileLineIterator(movieRatings)) {
- boolean firstLineRead = false;
- String movieID = null;
- while (lines.hasNext()) {
- String line = lines.next();
- if (firstLineRead) {
- String[] tokens = SEPARATOR.split(line);
- String userID = tokens[0];
- String rating = tokens[1];
- writer.write(userID + TAB + movieID + TAB + rating + NEWLINE);
- ratingsProcessed++;
- if (ratingsProcessed % 1000000 == 0) {
- log.info("{} ratings processed...", ratingsProcessed);
- }
- } else {
- movieID = line.replaceAll(MOVIE_DENOTER, "");
- firstLineRead = true;
- }
- }
- }
-
- }
- log.info("{} ratings processed. done.", ratingsProcessed);
- }
-
- log.info("Reading probes...");
- List<Preference> probes = new ArrayList<>(2817131);
- long currentMovieID = -1;
- for (String line : new FileLineIterable(new File(qualifyingTxt))) {
- if (line.contains(MOVIE_DENOTER)) {
- currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
- } else {
- long userID = Long.parseLong(SEPARATOR.split(line)[0]);
- probes.add(new GenericPreference(userID, currentMovieID, 0));
- }
- }
- log.info("{} probes read...", probes.size());
-
- log.info("Reading ratings, creating probe set at {}/probeSet/ratings.tsv ...", outputPath);
- try (BufferedWriter writer =
- new BufferedWriter(new OutputStreamWriter(
- fs.create(new Path(outputPath, "probeSet/ratings.tsv")), Charsets.UTF_8))){
- int ratingsProcessed = 0;
- for (String line : new FileLineIterable(new File(judgingTxt))) {
- if (line.contains(MOVIE_DENOTER)) {
- currentMovieID = Long.parseLong(line.replaceAll(MOVIE_DENOTER, ""));
- } else {
- float rating = Float.parseFloat(SEPARATOR.split(line)[0]);
- Preference pref = probes.get(ratingsProcessed);
- Preconditions.checkState(pref.getItemID() == currentMovieID);
- ratingsProcessed++;
- writer.write(pref.getUserID() + TAB + pref.getItemID() + TAB + rating + NEWLINE);
- if (ratingsProcessed % 1000000 == 0) {
- log.info("{} ratings processed...", ratingsProcessed);
- }
- }
- }
- log.info("{} ratings processed. done.", ratingsProcessed);
- }
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
deleted file mode 100644
index 8021d00..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/BatchItemSimilaritiesGroupLens.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.similarity.precompute.example;
-
-import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.precompute.FileSimilarItemsWriter;
-import org.apache.mahout.cf.taste.impl.similarity.precompute.MultithreadedBatchItemSimilarities;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.ItemBasedRecommender;
-import org.apache.mahout.cf.taste.similarity.precompute.BatchItemSimilarities;
-
-import java.io.File;
-
-/**
- * Example that precomputes all item similarities of the Movielens1M dataset
- *
- * Usage: download movielens1M from http://www.grouplens.org/node/73 , unzip it and invoke this code with the path
- * to the ratings.dat file as argument
- *
- */
-public final class BatchItemSimilaritiesGroupLens {
-
- private BatchItemSimilaritiesGroupLens() {}
-
- public static void main(String[] args) throws Exception {
-
- if (args.length != 1) {
- System.err.println("Need path to ratings.dat of the movielens1M dataset as argument!");
- System.exit(-1);
- }
-
- File resultFile = new File(System.getProperty("java.io.tmpdir"), "similarities.csv");
- if (resultFile.exists()) {
- resultFile.delete();
- }
-
- DataModel dataModel = new GroupLensDataModel(new File(args[0]));
- ItemBasedRecommender recommender = new GenericItemBasedRecommender(dataModel,
- new LogLikelihoodSimilarity(dataModel));
- BatchItemSimilarities batch = new MultithreadedBatchItemSimilarities(recommender, 5);
-
- int numSimilarities = batch.computeItemSimilarities(Runtime.getRuntime().availableProcessors(), 1,
- new FileSimilarItemsWriter(resultFile));
-
- System.out.println("Computed " + numSimilarities + " similarities for " + dataModel.getNumItems() + " items "
- + "and saved them to " + resultFile.getAbsolutePath());
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
deleted file mode 100644
index 7ee9b17..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/similarity/precompute/example/GroupLensDataModel.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.similarity.precompute.example;
-
-import com.google.common.io.Files;
-import com.google.common.io.InputSupplier;
-import com.google.common.io.Resources;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.net.URL;
-import java.util.regex.Pattern;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-public final class GroupLensDataModel extends FileDataModel {
-
- private static final String COLON_DELIMTER = "::";
- private static final Pattern COLON_DELIMITER_PATTERN = Pattern.compile(COLON_DELIMTER);
-
- public GroupLensDataModel() throws IOException {
- this(readResourceToTempFile("/org/apache/mahout/cf/taste/example/grouplens/ratings.dat"));
- }
-
- /**
- * @param ratingsFile GroupLens ratings.dat file in its native format
- * @throws IOException if an error occurs while reading or writing files
- */
- public GroupLensDataModel(File ratingsFile) throws IOException {
- super(convertGLFile(ratingsFile));
- }
-
- private static File convertGLFile(File originalFile) throws IOException {
- // Now translate the file; remove commas, then convert "::" delimiter to comma
- File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "ratings.txt");
- if (resultFile.exists()) {
- resultFile.delete();
- }
- try (Writer writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8)){
- for (String line : new FileLineIterable(originalFile, false)) {
- int lastDelimiterStart = line.lastIndexOf(COLON_DELIMTER);
- if (lastDelimiterStart < 0) {
- throw new IOException("Unexpected input format on line: " + line);
- }
- String subLine = line.substring(0, lastDelimiterStart);
- String convertedLine = COLON_DELIMITER_PATTERN.matcher(subLine).replaceAll(",");
- writer.write(convertedLine);
- writer.write('\n');
- }
- } catch (IOException ioe) {
- resultFile.delete();
- throw ioe;
- }
- return resultFile;
- }
-
- public static File readResourceToTempFile(String resourceName) throws IOException {
- InputSupplier<? extends InputStream> inSupplier;
- try {
- URL resourceURL = Resources.getResource(GroupLensDataModel.class, resourceName);
- inSupplier = Resources.newInputStreamSupplier(resourceURL);
- } catch (IllegalArgumentException iae) {
- File resourceFile = new File("src/main/java" + resourceName);
- inSupplier = Files.newInputStreamSupplier(resourceFile);
- }
- File tempFile = File.createTempFile("taste", null);
- tempFile.deleteOnExit();
- Files.copy(inSupplier, tempFile);
- return tempFile;
- }
-
- @Override
- public String toString() {
- return "GroupLensDataModel";
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
deleted file mode 100644
index 5cec51c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier;
-
-import com.google.common.collect.ConcurrentHashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.text.SimpleDateFormat;
-import java.util.Collection;
-import java.util.Date;
-import java.util.Locale;
-import java.util.Random;
-
-public final class NewsgroupHelper {
-
- private static final SimpleDateFormat[] DATE_FORMATS = {
- new SimpleDateFormat("", Locale.ENGLISH),
- new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
- new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
- };
-
- public static final int FEATURES = 10000;
- // 1997-01-15 00:01:00 GMT
- private static final long DATE_REFERENCE = 853286460;
- private static final long MONTH = 30 * 24 * 3600;
- private static final long WEEK = 7 * 24 * 3600;
-
- private final Random rand = RandomUtils.getRandom();
- private final Analyzer analyzer = new StandardAnalyzer();
- private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
- private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
-
- public FeatureVectorEncoder getEncoder() {
- return encoder;
- }
-
- public FeatureVectorEncoder getBias() {
- return bias;
- }
-
- public Random getRandom() {
- return rand;
- }
-
- public Vector encodeFeatureVector(File file, int actual, int leakType, Multiset<String> overallCounts)
- throws IOException {
- long date = (long) (1000 * (DATE_REFERENCE + actual * MONTH + 1 * WEEK * rand.nextDouble()));
- Multiset<String> words = ConcurrentHashMultiset.create();
-
- try (BufferedReader reader = Files.newReader(file, Charsets.UTF_8)) {
- String line = reader.readLine();
- Reader dateString = new StringReader(DATE_FORMATS[leakType % 3].format(new Date(date)));
- countWords(analyzer, words, dateString, overallCounts);
- while (line != null && !line.isEmpty()) {
- boolean countHeader = (
- line.startsWith("From:") || line.startsWith("Subject:")
- || line.startsWith("Keywords:") || line.startsWith("Summary:")) && leakType < 6;
- do {
- Reader in = new StringReader(line);
- if (countHeader) {
- countWords(analyzer, words, in, overallCounts);
- }
- line = reader.readLine();
- } while (line != null && line.startsWith(" "));
- }
- if (leakType < 3) {
- countWords(analyzer, words, reader, overallCounts);
- }
- }
-
- Vector v = new RandomAccessSparseVector(FEATURES);
- bias.addToVector("", 1, v);
- for (String word : words.elementSet()) {
- encoder.addToVector(word, Math.log1p(words.count(word)), v);
- }
-
- return v;
- }
-
- public static void countWords(Analyzer analyzer,
- Collection<String> words,
- Reader in,
- Multiset<String> overallCounts) throws IOException {
- TokenStream ts = analyzer.tokenStream("text", in);
- ts.addAttribute(CharTermAttribute.class);
- ts.reset();
- while (ts.incrementToken()) {
- String s = ts.getAttribute(CharTermAttribute.class).toString();
- words.add(s);
- }
- overallCounts.addAll(words);
- ts.end();
- Closeables.close(ts, true);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
deleted file mode 100644
index 16e9d80..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-import java.util.Locale;
-import java.util.regex.Pattern;
-
-/**
- * Convert the labels created by the {@link org.apache.mahout.utils.email.MailProcessor} to one consumable
- * by the classifiers
- */
-public class PrepEmailMapper extends Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
-
- private static final Pattern DASH_DOT = Pattern.compile("-|\\.");
- private static final Pattern SLASH = Pattern.compile("\\/");
-
- private boolean useListName = false; //if true, use the project name and the list name in label creation
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- useListName = Boolean.parseBoolean(context.getConfiguration().get(PrepEmailVectorsDriver.USE_LIST_NAME));
- }
-
- @Override
- protected void map(WritableComparable<?> key, VectorWritable value, Context context)
- throws IOException, InterruptedException {
- String input = key.toString();
- ///Example: /cocoon.apache.org/dev/200307.gz/001401c3414f$8394e160$***@WRPO
- String[] splits = SLASH.split(input);
- //we need the first two splits;
- if (splits.length >= 3) {
- StringBuilder bldr = new StringBuilder();
- bldr.append(escape(splits[1]));
- if (useListName) {
- bldr.append('_').append(escape(splits[2]));
- }
- context.write(new Text(bldr.toString()), value);
- }
-
- }
-
- private static String escape(CharSequence value) {
- return DASH_DOT.matcher(value).replaceAll("_").toLowerCase(Locale.ENGLISH);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
deleted file mode 100644
index da6e613..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.VectorWritable;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-public class PrepEmailReducer extends Reducer<Text, VectorWritable, Text, VectorWritable> {
-
- private long maxItemsPerLabel = 10000;
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- maxItemsPerLabel = Long.parseLong(context.getConfiguration().get(PrepEmailVectorsDriver.ITEMS_PER_CLASS));
- }
-
- @Override
- protected void reduce(Text key, Iterable<VectorWritable> values, Context context)
- throws IOException, InterruptedException {
- //TODO: support randomization? Likely not needed due to the SplitInput utility which does random selection
- long i = 0;
- Iterator<VectorWritable> iterator = values.iterator();
- while (i < maxItemsPerLabel && iterator.hasNext()) {
- context.write(key, iterator.next());
- i++;
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
deleted file mode 100644
index 8fba739..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailVectorsDriver.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.email;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.math.VectorWritable;
-
-import java.util.List;
-import java.util.Map;
-
-/**
- * Convert the labels generated by {@link org.apache.mahout.text.SequenceFilesFromMailArchives} and
- * {@link org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles} to ones consumable by the classifiers. We do this
- * here b/c if it is done in the creation of sparse vectors, the Reducer collapses all the vectors.
- */
-public class PrepEmailVectorsDriver extends AbstractJob {
-
- public static final String ITEMS_PER_CLASS = "itemsPerClass";
- public static final String USE_LIST_NAME = "USE_LIST_NAME";
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new PrepEmailVectorsDriver(), args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption("maxItemsPerLabel", "mipl", "The maximum number of items per label. Can be useful for making the "
- + "training sets the same size", String.valueOf(100000));
- addOption(buildOption("useListName", "ul", "Use the name of the list as part of the label. If not set, then "
- + "just use the project name", false, false, "false"));
- Map<String,List<String>> parsedArgs = parseArguments(args);
- if (parsedArgs == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- Job convertJob = prepareJob(input, output, SequenceFileInputFormat.class, PrepEmailMapper.class, Text.class,
- VectorWritable.class, PrepEmailReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
- convertJob.getConfiguration().set(ITEMS_PER_CLASS, getOption("maxItemsPerLabel"));
- convertJob.getConfiguration().set(USE_LIST_NAME, String.valueOf(hasOption("useListName")));
-
- boolean succeeded = convertJob.waitForCompletion(true);
- return succeeded ? 0 : -1;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
deleted file mode 100644
index 9c0ef56..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sequencelearning.hmm;
-
-import com.google.common.io.Resources;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.Matrix;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-/**
- * This class implements a sample program that uses a pre-tagged training data
- * set to train an HMM model as a POS tagger. The training data is automatically
- * downloaded from the following URL:
- * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt It then
- * trains an HMM Model using supervised learning and tests the model on the
- * following test data set:
- * http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt Further
- * details regarding the data files can be found at
- * http://flexcrfs.sourceforge.net/#Case_Study
- */
-public final class PosTagger {
-
- private static final Logger log = LoggerFactory.getLogger(PosTagger.class);
-
- private static final Pattern SPACE = Pattern.compile(" ");
- private static final Pattern SPACES = Pattern.compile("[ ]+");
-
- /**
- * No public constructors for utility classes.
- */
- private PosTagger() {
- // nothing to do here really.
- }
-
- /**
- * Model trained in the example.
- */
- private static HmmModel taggingModel;
-
- /**
- * Map for storing the IDs for the POS tags (hidden states)
- */
- private static Map<String, Integer> tagIDs;
-
- /**
- * Counter for the next assigned POS tag ID The value of 0 is reserved for
- * "unknown POS tag"
- */
- private static int nextTagId;
-
- /**
- * Map for storing the IDs for observed words (observed states)
- */
- private static Map<String, Integer> wordIDs;
-
- /**
- * Counter for the next assigned word ID The value of 0 is reserved for
- * "unknown word"
- */
- private static int nextWordId = 1; // 0 is reserved for "unknown word"
-
- /**
- * Used for storing a list of POS tags of read sentences.
- */
- private static List<int[]> hiddenSequences;
-
- /**
- * Used for storing a list of word tags of read sentences.
- */
- private static List<int[]> observedSequences;
-
- /**
- * number of read lines
- */
- private static int readLines;
-
- /**
- * Given an URL, this function fetches the data file, parses it, assigns POS
- * Tag/word IDs and fills the hiddenSequences/observedSequences lists with
- * data from those files. The data is expected to be in the following format
- * (one word per line): word pos-tag np-tag sentences are closed with the .
- * pos tag
- *
- * @param url Where the data file is stored
- * @param assignIDs Should IDs for unknown words/tags be assigned? (Needed for
- * training data, not needed for test data)
- * @throws IOException in case data file cannot be read.
- */
- private static void readFromURL(String url, boolean assignIDs) throws IOException {
- // initialize the data structure
- hiddenSequences = new LinkedList<>();
- observedSequences = new LinkedList<>();
- readLines = 0;
-
- // now read line by line of the input file
- List<Integer> observedSequence = new LinkedList<>();
- List<Integer> hiddenSequence = new LinkedList<>();
-
- for (String line :Resources.readLines(new URL(url), Charsets.UTF_8)) {
- if (line.isEmpty()) {
- // new sentence starts
- int[] observedSequenceArray = new int[observedSequence.size()];
- int[] hiddenSequenceArray = new int[hiddenSequence.size()];
- for (int i = 0; i < observedSequence.size(); ++i) {
- observedSequenceArray[i] = observedSequence.get(i);
- hiddenSequenceArray[i] = hiddenSequence.get(i);
- }
- // now register those arrays
- hiddenSequences.add(hiddenSequenceArray);
- observedSequences.add(observedSequenceArray);
- // and reset the linked lists
- observedSequence.clear();
- hiddenSequence.clear();
- continue;
- }
- readLines++;
- // we expect the format [word] [POS tag] [NP tag]
- String[] tags = SPACE.split(line);
- // when analyzing the training set, assign IDs
- if (assignIDs) {
- if (!wordIDs.containsKey(tags[0])) {
- wordIDs.put(tags[0], nextWordId++);
- }
- if (!tagIDs.containsKey(tags[1])) {
- tagIDs.put(tags[1], nextTagId++);
- }
- }
- // determine the IDs
- Integer wordID = wordIDs.get(tags[0]);
- Integer tagID = tagIDs.get(tags[1]);
- // now construct the current sequence
- if (wordID == null) {
- observedSequence.add(0);
- } else {
- observedSequence.add(wordID);
- }
-
- if (tagID == null) {
- hiddenSequence.add(0);
- } else {
- hiddenSequence.add(tagID);
- }
- }
-
- // if there is still something in the pipe, register it
- if (!observedSequence.isEmpty()) {
- int[] observedSequenceArray = new int[observedSequence.size()];
- int[] hiddenSequenceArray = new int[hiddenSequence.size()];
- for (int i = 0; i < observedSequence.size(); ++i) {
- observedSequenceArray[i] = observedSequence.get(i);
- hiddenSequenceArray[i] = hiddenSequence.get(i);
- }
- // now register those arrays
- hiddenSequences.add(hiddenSequenceArray);
- observedSequences.add(observedSequenceArray);
- }
- }
-
- private static void trainModel(String trainingURL) throws IOException {
- tagIDs = new HashMap<>(44); // we expect 44 distinct tags
- wordIDs = new HashMap<>(19122); // we expect 19122
- // distinct words
- log.info("Reading and parsing training data file from URL: {}", trainingURL);
- long start = System.currentTimeMillis();
- readFromURL(trainingURL, true);
- long end = System.currentTimeMillis();
- double duration = (end - start) / 1000.0;
- log.info("Parsing done in {} seconds!", duration);
- log.info("Read {} lines containing {} sentences with a total of {} distinct words and {} distinct POS tags.",
- readLines, hiddenSequences.size(), nextWordId - 1, nextTagId - 1);
- start = System.currentTimeMillis();
- taggingModel = HmmTrainer.trainSupervisedSequence(nextTagId, nextWordId,
- hiddenSequences, observedSequences, 0.05);
- // we have to adjust the model a bit,
- // since we assume a higher probability that a given unknown word is NNP
- // than anything else
- Matrix emissions = taggingModel.getEmissionMatrix();
- for (int i = 0; i < taggingModel.getNrOfHiddenStates(); ++i) {
- emissions.setQuick(i, 0, 0.1 / taggingModel.getNrOfHiddenStates());
- }
- int nnptag = tagIDs.get("NNP");
- emissions.setQuick(nnptag, 0, 1 / (double) taggingModel.getNrOfHiddenStates());
- // re-normalize the emission probabilities
- HmmUtils.normalizeModel(taggingModel);
- // now register the names
- taggingModel.registerHiddenStateNames(tagIDs);
- taggingModel.registerOutputStateNames(wordIDs);
- end = System.currentTimeMillis();
- duration = (end - start) / 1000.0;
- log.info("Trained HMM models in {} seconds!", duration);
- }
-
- private static void testModel(String testingURL) throws IOException {
- log.info("Reading and parsing test data file from URL: {}", testingURL);
- long start = System.currentTimeMillis();
- readFromURL(testingURL, false);
- long end = System.currentTimeMillis();
- double duration = (end - start) / 1000.0;
- log.info("Parsing done in {} seconds!", duration);
- log.info("Read {} lines containing {} sentences.", readLines, hiddenSequences.size());
-
- start = System.currentTimeMillis();
- int errorCount = 0;
- int totalCount = 0;
- for (int i = 0; i < observedSequences.size(); ++i) {
- // fetch the viterbi path as the POS tag for this observed sequence
- int[] posEstimate = HmmEvaluator.decode(taggingModel, observedSequences.get(i), false);
- // compare with the expected
- int[] posExpected = hiddenSequences.get(i);
- for (int j = 0; j < posExpected.length; ++j) {
- totalCount++;
- if (posEstimate[j] != posExpected[j]) {
- errorCount++;
- }
- }
- }
- end = System.currentTimeMillis();
- duration = (end - start) / 1000.0;
- log.info("POS tagged test file in {} seconds!", duration);
- double errorRate = (double) errorCount / totalCount;
- log.info("Tagged the test file with an error rate of: {}", errorRate);
- }
-
- private static List<String> tagSentence(String sentence) {
- // first, we need to isolate all punctuation characters, so that they
- // can be recognized
- sentence = sentence.replaceAll("[,.!?:;\"]", " $0 ");
- sentence = sentence.replaceAll("''", " '' ");
- // now we tokenize the sentence
- String[] tokens = SPACES.split(sentence);
- // now generate the observed sequence
- int[] observedSequence = HmmUtils.encodeStateSequence(taggingModel, Arrays.asList(tokens), true, 0);
- // POS tag this observedSequence
- int[] hiddenSequence = HmmEvaluator.decode(taggingModel, observedSequence, false);
- // and now decode the tag names
- return HmmUtils.decodeStateSequence(taggingModel, hiddenSequence, false, null);
- }
-
- public static void main(String[] args) throws IOException {
- // generate the model from URL
- trainModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/train.txt");
- testModel("http://www.jaist.ac.jp/~hieuxuan/flexcrfs/CoNLL2000-NP/test.txt");
- // tag an exemplary sentence
- String test = "McDonalds is a huge company with many employees .";
- String[] testWords = SPACE.split(test);
- List<String> posTags = tagSentence(test);
- for (int i = 0; i < posTags.size(); ++i) {
- log.info("{}[{}]", testWords[i], posTags.get(i));
- }
- }
-
-}
r***@apache.org
2018-06-28 14:55:10 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/bank-full.csv b/community/mahout-mr/examples/src/main/resources/bank-full.csv
deleted file mode 100644
index d7a2ede..0000000
--- a/community/mahout-mr/examples/src/main/resources/bank-full.csv
+++ /dev/null
@@ -1,45212 +0,0 @@
-"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
-58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
-44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
-33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
-35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
-28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
-58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
-45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
-57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
-54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
-58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
-36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
-44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
-32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
-24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
-38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
-40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
-46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
-41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
-46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
-57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
-39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
-27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
-59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
-29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
-56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
-57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
-43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
-31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
-55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
-55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
-32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
-28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
-53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
-34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
-57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
-43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
-26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
-39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
-48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
-52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
-54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
-54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
-50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
-44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
-35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
-51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
-31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
-35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
-36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
-40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
-51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
-50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
-61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
-35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
-39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
-42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
-59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
-40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
-47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
-53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
-46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
-53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
-57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
-49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
-42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
-22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
-51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
-50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
-59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
-39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
-42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
-40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
-56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
-37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
-39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
-38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
-54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
-58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
-40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
-56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
-42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
-51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
-36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
-54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
-37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
-33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
-46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
-51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
-40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
-48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
-32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
-55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
-40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
-58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
-45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
-51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
-43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
-44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
-46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
-59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
-44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
-33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
-46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
-43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
-23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
-25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
-40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
-58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
-32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
-58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
-37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
-27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
-42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
-29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
-58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
-46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
-34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
-49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
-32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
-43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
-58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
-24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
-51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
-50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
-40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
-33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
-36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
-57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
-36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
-44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
-39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
-40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
-54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
-50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
-37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
-46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
-32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
-48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
-41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
-44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
-38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
-48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
-42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
-34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
-56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
-39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
-46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
-38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
-56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
-37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
-37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
-48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
-30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
-48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
-31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
-37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
-49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
-43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
-32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
-55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
-31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
-35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
-34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
-32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
-33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
-52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
-55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
-38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
-31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
-28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
-45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
-35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
-60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
-49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
-38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
-40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
-36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
-44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
-40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
-30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
-57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
-24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
-33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
-43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
-43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
-35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
-56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
-40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
-44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
-28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
-47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
-56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
-31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
-30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
-38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
-55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
-59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
-33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
-30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
-42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
-55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
-51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
-32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
-29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
-46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
-56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
-29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
-47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
-56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
-45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
-31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
-37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
-30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
-58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
-36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
-40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
-42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
-35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
-44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
-31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
-36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
-47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
-37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
-26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
-52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
-55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
-32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
-37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";

<TRUNCATED>
r***@apache.org
2018-06-28 14:55:09 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt b/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
deleted file mode 100644
index d87c031..0000000
--- a/community/mahout-mr/examples/src/main/resources/cf-data-purchase.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-u1,iphone
-u1,ipad
-u2,nexus
-u2,galaxy
-u3,surface
-u4,iphone
-u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt b/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
deleted file mode 100644
index 09ad9b6..0000000
--- a/community/mahout-mr/examples/src/main/resources/cf-data-view.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-u1,ipad
-u1,nexus
-u1,galaxy
-u2,iphone
-u2,ipad
-u2,nexus
-u2,galaxy
-u3,surface
-u3,nexus
-u4,iphone
-u4,ipad
-u4,galaxy

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/donut-test.csv b/community/mahout-mr/examples/src/main/resources/donut-test.csv
deleted file mode 100644
index 46ea564..0000000
--- a/community/mahout-mr/examples/src/main/resources/donut-test.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","xx","xy","yy","c","a","b"
-0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
-0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
-0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
-0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
-0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
-0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
-0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
-0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
-0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
-0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
-0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
-0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
-0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
-0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
-0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
-0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
-0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
-0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
-0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
-0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
-0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
-0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
-0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
-0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
-0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
-0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
-0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
-0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
-0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
-0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
-0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
-0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
-0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
-0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
-0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
-0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
-0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
-0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
-0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
-0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/donut.csv b/community/mahout-mr/examples/src/main/resources/donut.csv
deleted file mode 100644
index 33ba3b7..0000000
--- a/community/mahout-mr/examples/src/main/resources/donut.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
-0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
-0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
-0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
-0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
-0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
-0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
-0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
-0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
-0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
-0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
-0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
-0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
-0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
-0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
-0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
-0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
-0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
-0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
-0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
-0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
-0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
-0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
-0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
-0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
-0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
-0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
-0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
-0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
-0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
-0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
-0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
-0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
-0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
-0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
-0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
-0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
-0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
-0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
-0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
-0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/resources/test-data.csv b/community/mahout-mr/examples/src/main/resources/test-data.csv
deleted file mode 100644
index ab683cd..0000000
--- a/community/mahout-mr/examples/src/main/resources/test-data.csv
+++ /dev/null
@@ -1,61 +0,0 @@
-"V1","V2","V3","V4","V5","V6","V7","V8","y"
-1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
-1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
-1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
-1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
-1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
-1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
-1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
-1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
-1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
-1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
-1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
-1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
-1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
-1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
-1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
-1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
-1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
-1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
-1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
-1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
-1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
-1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
-1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
-1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
-1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
-1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
-1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
-1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
-1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
-1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
-1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
-1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
-1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
-1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
-1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
-1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
-1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
-1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
-1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
-1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
-1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
-1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
-1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
-1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
-1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
-1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
-1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
-1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
-1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
-1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
-1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
-1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
-1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
-1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
-1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
-1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
-1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
-1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
-1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
-1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
deleted file mode 100644
index e849011..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/LogisticModelParametersTest.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Test;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collections;
-
-public class LogisticModelParametersTest extends MahoutTestCase {
-
- @Test
- public void serializationWithoutCsv() throws IOException {
- LogisticModelParameters params = new LogisticModelParameters();
- params.setTargetVariable("foo");
- params.setTypeMap(Collections.<String, String>emptyMap());
- params.setTargetCategories(Arrays.asList("foo", "bar"));
- params.setNumFeatures(1);
- params.createRegression();
-
- //MAHOUT-1196 should work without "csv" being set
- params.saveTo(new ByteArrayOutputStream());
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
deleted file mode 100644
index c8e4879..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/ModelDissectorTest.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.examples.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.junit.Test;
-
-public class ModelDissectorTest extends MahoutTestCase {
- @Test
- public void testCategoryOrdering() {
- ModelDissector.Weight w = new ModelDissector.Weight("a", new DenseVector(new double[]{-2, -5, 5, 2, 4, 1, 0}), 4);
- assertEquals(1, w.getCategory(0), 0);
- assertEquals(-5, w.getWeight(0), 0);
-
- assertEquals(2, w.getCategory(1), 0);
- assertEquals(5, w.getWeight(1), 0);
-
- assertEquals(4, w.getCategory(2), 0);
- assertEquals(4, w.getWeight(2), 0);
-
- assertEquals(0, w.getCategory(3), 0);
- assertEquals(-2, w.getWeight(3), 0);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
deleted file mode 100644
index 4cde692..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/classifier/sgd/TrainLogisticTest.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Charsets;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Sets;
-import com.google.common.io.Resources;
-import org.apache.mahout.classifier.AbstractVectorClassifier;
-import org.apache.mahout.examples.MahoutTestCase;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-public class TrainLogisticTest extends MahoutTestCase {
-
- @Test
- public void example131() throws Exception {
- String outputFile = getTestTempFile("model").getAbsolutePath();
-
- StringWriter sw = new StringWriter();
- PrintWriter pw = new PrintWriter(sw, true);
- TrainLogistic.mainToOutput(new String[]{
- "--input", "donut.csv",
- "--output", outputFile,
- "--target", "color", "--categories", "2",
- "--predictors", "x", "y",
- "--types", "numeric",
- "--features", "20",
- "--passes", "100",
- "--rate", "50"
- }, pw);
- String trainOut = sw.toString();
- assertTrue(trainOut.contains("x -0.7"));
- assertTrue(trainOut.contains("y -0.4"));
-
- LogisticModelParameters lmp = TrainLogistic.getParameters();
- assertEquals(1.0e-4, lmp.getLambda(), 1.0e-9);
- assertEquals(20, lmp.getNumFeatures());
- assertTrue(lmp.useBias());
- assertEquals("color", lmp.getTargetVariable());
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- assertEquals("[1, 2]", new TreeSet<>(csv.getTargetCategories()).toString());
- assertEquals("[Intercept Term, x, y]", Sets.newTreeSet(csv.getPredictors()).toString());
-
- // verify model by building dissector
- AbstractVectorClassifier model = TrainLogistic.getModel();
- List<String> data = Resources.readLines(Resources.getResource("donut.csv"), Charsets.UTF_8);
- Map<String, Double> expectedValues = ImmutableMap.of("x", -0.7, "y", -0.43, "Intercept Term", -0.15);
- verifyModel(lmp, csv, data, model, expectedValues);
-
- // test saved model
- try (InputStream in = new FileInputStream(new File(outputFile))){
- LogisticModelParameters lmpOut = LogisticModelParameters.loadFrom(in);
- CsvRecordFactory csvOut = lmpOut.getCsvRecordFactory();
- csvOut.firstLine(data.get(0));
- OnlineLogisticRegression lrOut = lmpOut.createRegression();
- verifyModel(lmpOut, csvOut, data, lrOut, expectedValues);
- }
-
- sw = new StringWriter();
- pw = new PrintWriter(sw, true);
- RunLogistic.mainToOutput(new String[]{
- "--input", "donut.csv",
- "--model", outputFile,
- "--auc",
- "--confusion"
- }, pw);
- trainOut = sw.toString();
- assertTrue(trainOut.contains("AUC = 0.57"));
- assertTrue(trainOut.contains("confusion: [[27.0, 13.0], [0.0, 0.0]]"));
- }
-
- @Test
- public void example132() throws Exception {
- String outputFile = getTestTempFile("model").getAbsolutePath();
-
- StringWriter sw = new StringWriter();
- PrintWriter pw = new PrintWriter(sw, true);
- TrainLogistic.mainToOutput(new String[]{
- "--input", "donut.csv",
- "--output", outputFile,
- "--target", "color",
- "--categories", "2",
- "--predictors", "x", "y", "a", "b", "c",
- "--types", "numeric",
- "--features", "20",
- "--passes", "100",
- "--rate", "50"
- }, pw);
-
- String trainOut = sw.toString();
- assertTrue(trainOut.contains("a 0."));
- assertTrue(trainOut.contains("b -1."));
- assertTrue(trainOut.contains("c -25."));
-
- sw = new StringWriter();
- pw = new PrintWriter(sw, true);
- RunLogistic.mainToOutput(new String[]{
- "--input", "donut.csv",
- "--model", outputFile,
- "--auc",
- "--confusion"
- }, pw);
- trainOut = sw.toString();
- assertTrue(trainOut.contains("AUC = 1.00"));
-
- sw = new StringWriter();
- pw = new PrintWriter(sw, true);
- RunLogistic.mainToOutput(new String[]{
- "--input", "donut-test.csv",
- "--model", outputFile,
- "--auc",
- "--confusion"
- }, pw);
- trainOut = sw.toString();
- assertTrue(trainOut.contains("AUC = 0.9"));
- }
-
- private static void verifyModel(LogisticModelParameters lmp,
- RecordFactory csv,
- List<String> data,
- AbstractVectorClassifier model,
- Map<String, Double> expectedValues) {
- ModelDissector md = new ModelDissector();
- for (String line : data.subList(1, data.size())) {
- Vector v = new DenseVector(lmp.getNumFeatures());
- csv.getTraceDictionary().clear();
- csv.processLine(line, v);
- md.update(v, csv.getTraceDictionary(), model);
- }
-
- // check right variables are present
- List<ModelDissector.Weight> weights = md.summary(10);
- Set<String> expected = Sets.newHashSet(expectedValues.keySet());
- for (ModelDissector.Weight weight : weights) {
- assertTrue(expected.remove(weight.getFeature()));
- assertEquals(expectedValues.get(weight.getFeature()), weight.getWeight(), 0.1);
- }
- assertEquals(0, expected.size());
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
deleted file mode 100644
index 6e43b97..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.mahout.common.MahoutTestCase;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.IOException;
-
-public class ClustersFilterTest extends MahoutTestCase {
-
- private Configuration configuration;
- private Path output;
-
- @Override
- @Before
- public void setUp() throws Exception {
- super.setUp();
- configuration = getConfiguration();
- output = getTestTempDirPath();
- }
-
- @Test
- public void testAcceptNotFinal() throws Exception {
- Path path0 = new Path(output, "clusters-0");
- Path path1 = new Path(output, "clusters-1");
-
- path0.getFileSystem(configuration).createNewFile(path0);
- path1.getFileSystem(configuration).createNewFile(path1);
-
- PathFilter clustersFilter = new ClustersFilter();
-
- assertTrue(clustersFilter.accept(path0));
- assertTrue(clustersFilter.accept(path1));
- }
-
- @Test
- public void testAcceptFinalPath() throws IOException {
- Path path0 = new Path(output, "clusters-0");
- Path path1 = new Path(output, "clusters-1");
- Path path2 = new Path(output, "clusters-2");
- Path path3Final = new Path(output, "clusters-3-final");
-
- path0.getFileSystem(configuration).createNewFile(path0);
- path1.getFileSystem(configuration).createNewFile(path1);
- path2.getFileSystem(configuration).createNewFile(path2);
- path3Final.getFileSystem(configuration).createNewFile(path3Final);
-
- PathFilter clustersFilter = new ClustersFilter();
-
- assertTrue(clustersFilter.accept(path0));
- assertTrue(clustersFilter.accept(path1));
- assertTrue(clustersFilter.accept(path2));
- assertTrue(clustersFilter.accept(path3Final));
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java b/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
deleted file mode 100644
index 4d81e3f..0000000
--- a/community/mahout-mr/examples/src/test/java/org/apache/mahout/examples/MahoutTestCase.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.examples;
-
-/**
- * This class should not exist. It's here to work around some bizarre problem in Maven
- * dependency management wherein it can see methods in {@link org.apache.mahout.common.MahoutTestCase}
- * but not constants. Duplicated here to make it jive.
- */
-public abstract class MahoutTestCase extends org.apache.mahout.common.MahoutTestCase {
-
- /** "Close enough" value for floating-point comparisons. */
- public static final double EPSILON = 0.000001;
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country.txt b/community/mahout-mr/examples/src/test/resources/country.txt
deleted file mode 100644
index 6a22091..0000000
--- a/community/mahout-mr/examples/src/test/resources/country.txt
+++ /dev/null
@@ -1,229 +0,0 @@
-Afghanistan
-Albania
-Algeria
-American Samoa
-Andorra
-Angola
-Anguilla
-Antigua and Barbuda
-Argentina
-Armenia
-Aruba
-Australia
-Austria
-Azerbaijan
-Bahamas
-Bangladesh
-Barbados
-Belarus
-Belgium
-Belize
-Benin
-Bermuda
-Bhutan
-Bolivia
-Bosnia and Herzegovina
-Botswana
-Bouvet Island
-Brazil
-British Indian Ocean Territory
-Brunei Darussalam
-Bulgaria
-Burkina Faso
-Burundi
-Cambodia
-Cameroon
-Canada
-Cape Verde
-Cayman Islands
-Central African Republic
-Chad
-Chile
-China
-Christmas Island
-Cocos Islands
-Colombia
-Comoros
-Congo
-Cook Islands
-Costa Rica
-Croatia
-Côte d'Ivoire
-Cuba
-Cyprus
-Czech Republic
-Djibouti
-Dominica
-Dominican Republic
-Ecuador
-Egypt
-El Salvador
-Equatorial Guinea
-Eritrea
-Estonia
-Ethiopia
-Falkland Islands
-Faroe Islands
-Fiji
-Finland
-France
-French Guiana
-French Polynesia
-French Southern Territories
-Gabon
-Georgia
-Germany
-Ghana
-Gibraltar
-Greece
-Greenland
-Grenada
-Guadeloupe
-Guam
-Guatemala
-Guernsey
-Guinea
-Guinea-Bissau
-Guyana
-Haiti
-Honduras
-Hong Kong
-Hungary
-Iceland
-India
-Indonesia
-Iran
-Iraq
-Ireland
-Isle of Man
-Israel
-Italy
-Japan
-Jersey
-Jordan
-Kazakhstan
-Kenya
-Kiribati
-Korea
-Kuwait
-Kyrgyzstan
-Latvia
-Lebanon
-Lesotho
-Liberia
-Liechtenstein
-Lithuania
-Luxembourg
-Macedonia
-Madagascar
-Malawi
-Malaysia
-Maldives
-Mali
-Malta
-Marshall Islands
-Martinique
-Mauritania
-Mauritius
-Mayotte
-Mexico
-Micronesia
-Moldova
-Monaco
-Mongolia
-Montenegro
-Montserrat
-Morocco
-Mozambique
-Myanmar
-Namibia
-Nauru
-Nepal
-Netherlands
-Netherlands Antilles
-New Caledonia
-New Zealand
-Nicaragua
-Niger
-Nigeria
-Niue
-Norfolk Island
-Northern Mariana Islands
-Norway
-Oman
-Pakistan
-Palau
-Palestinian Territory
-Panama
-Papua New Guinea
-Paraguay
-Peru
-Philippines
-Pitcairn
-Poland
-Portugal
-Puerto Rico
-Qatar
-Réunion
-Russian Federation
-Rwanda
-Saint Barthélemy
-Saint Helena
-Saint Kitts and Nevis
-Saint Lucia
-Saint Martin
-Saint Pierre and Miquelon
-Saint Vincent and the Grenadines
-Samoa
-San Marino
-Sao Tome and Principe
-Saudi Arabia
-Senegal
-Serbia
-Seychelles
-Sierra Leone
-Singapore
-Slovakia
-Slovenia
-Solomon Islands
-Somalia
-South Africa
-South Georgia and the South Sandwich Islands
-Spain
-Sri Lanka
-Sudan
-Suriname
-Svalbard and Jan Mayen
-Swaziland
-Sweden
-Switzerland
-Syrian Arab Republic
-Taiwan
-Tanzania
-Thailand
-Timor-Leste
-Togo
-Tokelau
-Tonga
-Trinidad and Tobago
-Tunisia
-Turkey
-Turkmenistan
-Turks and Caicos Islands
-Tuvalu
-Ukraine
-United Arab Emirates
-United Kingdom
-United States
-United States Minor Outlying Islands
-Uruguay
-Uzbekistan
-Vanuatu
-Vatican
-Venezuela
-Vietnam
-Virgin Islands
-Wallis and Futuna
-Yemen
-Zambia
-Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country10.txt b/community/mahout-mr/examples/src/test/resources/country10.txt
deleted file mode 100644
index 97a63e1..0000000
--- a/community/mahout-mr/examples/src/test/resources/country10.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Australia
-Austria
-Bahamas
-Canada
-Colombia
-Cuba
-Panama
-Pakistan
-United Kingdom
-Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/country2.txt b/community/mahout-mr/examples/src/test/resources/country2.txt
deleted file mode 100644
index f4b4f61..0000000
--- a/community/mahout-mr/examples/src/test/resources/country2.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-United States
-United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/subjects.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/subjects.txt b/community/mahout-mr/examples/src/test/resources/subjects.txt
deleted file mode 100644
index f52ae33..0000000
--- a/community/mahout-mr/examples/src/test/resources/subjects.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Science
-History

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/wdbc.infos
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/wdbc.infos b/community/mahout-mr/examples/src/test/resources/wdbc.infos
deleted file mode 100644
index 94a63d6..0000000
--- a/community/mahout-mr/examples/src/test/resources/wdbc.infos
+++ /dev/null
@@ -1,32 +0,0 @@
-IGNORED
-LABEL, B, M
-NUMERICAL, 6.9, 28.2
-NUMERICAL, 9.7, 39.3
-NUMERICAL, 43.7, 188.5
-NUMERICAL, 143.5, 2501.0
-NUMERICAL, 0.0, 0.2
-NUMERICAL, 0.0, 0.4
-NUMERICAL, 0.0, 0.5
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.1, 0.4
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.1, 2.9
-NUMERICAL, 0.3, 4.9
-NUMERICAL, 0.7, 22.0
-NUMERICAL, 6.8, 542.3
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.2
-NUMERICAL, 0.0, 0.4
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 0.0, 0.1
-NUMERICAL, 7.9, 36.1
-NUMERICAL, 12.0, 49.6
-NUMERICAL, 50.4, 251.2
-NUMERICAL, 185.2, 4254.0
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.0, 1.1
-NUMERICAL, 0.0, 1.3
-NUMERICAL, 0.0, 0.3
-NUMERICAL, 0.1, 0.7
-NUMERICAL, 0.0, 0.3
r***@apache.org
2018-06-28 14:55:13 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
deleted file mode 100644
index b2ce8b1..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.mahout.math.stats.GlobalOnlineAuc;
-import org.apache.mahout.math.stats.GroupedOnlineAuc;
-import org.apache.mahout.math.stats.OnlineAuc;
-
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-
-public class AdaptiveLogisticModelParameters extends LogisticModelParameters {
-
- private AdaptiveLogisticRegression alr;
- private int interval = 800;
- private int averageWindow = 500;
- private int threads = 4;
- private String prior = "L1";
- private double priorOption = Double.NaN;
- private String auc = null;
-
- public AdaptiveLogisticRegression createAdaptiveLogisticRegression() {
-
- if (alr == null) {
- alr = new AdaptiveLogisticRegression(getMaxTargetCategories(),
- getNumFeatures(), createPrior(prior, priorOption));
- alr.setInterval(interval);
- alr.setAveragingWindow(averageWindow);
- alr.setThreadCount(threads);
- alr.setAucEvaluator(createAUC(auc));
- }
- return alr;
- }
-
- public void checkParameters() {
- if (prior != null) {
- String priorUppercase = prior.toUpperCase(Locale.ENGLISH).trim();
- if (("TP".equals(priorUppercase) || "EBP".equals(priorUppercase)) && Double.isNaN(priorOption)) {
- throw new IllegalArgumentException("You must specify a double value for TPrior and ElasticBandPrior.");
- }
- }
- }
-
- private static PriorFunction createPrior(String cmd, double priorOption) {
- if (cmd == null) {
- return null;
- }
- if ("L1".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new L1();
- }
- if ("L2".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new L2();
- }
- if ("UP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new UniformPrior();
- }
- if ("TP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new TPrior(priorOption);
- }
- if ("EBP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new ElasticBandPrior(priorOption);
- }
-
- return null;
- }
-
- private static OnlineAuc createAUC(String cmd) {
- if (cmd == null) {
- return null;
- }
- if ("GLOBAL".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new GlobalOnlineAuc();
- }
- if ("GROUPED".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
- return new GroupedOnlineAuc();
- }
- return null;
- }
-
- @Override
- public void saveTo(OutputStream out) throws IOException {
- if (alr != null) {
- alr.close();
- }
- setTargetCategories(getCsvRecordFactory().getTargetCategories());
- write(new DataOutputStream(out));
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(getTargetVariable());
- out.writeInt(getTypeMap().size());
- for (Map.Entry<String, String> entry : getTypeMap().entrySet()) {
- out.writeUTF(entry.getKey());
- out.writeUTF(entry.getValue());
- }
- out.writeInt(getNumFeatures());
- out.writeInt(getMaxTargetCategories());
- out.writeInt(getTargetCategories().size());
- for (String category : getTargetCategories()) {
- out.writeUTF(category);
- }
-
- out.writeInt(interval);
- out.writeInt(averageWindow);
- out.writeInt(threads);
- out.writeUTF(prior);
- out.writeDouble(priorOption);
- out.writeUTF(auc);
-
- // skip csv
- alr.write(out);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- setTargetVariable(in.readUTF());
- int typeMapSize = in.readInt();
- Map<String, String> typeMap = new HashMap<>(typeMapSize);
- for (int i = 0; i < typeMapSize; i++) {
- String key = in.readUTF();
- String value = in.readUTF();
- typeMap.put(key, value);
- }
- setTypeMap(typeMap);
-
- setNumFeatures(in.readInt());
- setMaxTargetCategories(in.readInt());
- int targetCategoriesSize = in.readInt();
- List<String> targetCategories = new ArrayList<>(targetCategoriesSize);
- for (int i = 0; i < targetCategoriesSize; i++) {
- targetCategories.add(in.readUTF());
- }
- setTargetCategories(targetCategories);
-
- interval = in.readInt();
- averageWindow = in.readInt();
- threads = in.readInt();
- prior = in.readUTF();
- priorOption = in.readDouble();
- auc = in.readUTF();
-
- alr = new AdaptiveLogisticRegression();
- alr.readFields(in);
- }
-
-
- private static AdaptiveLogisticModelParameters loadFromStream(InputStream in) throws IOException {
- AdaptiveLogisticModelParameters result = new AdaptiveLogisticModelParameters();
- result.readFields(new DataInputStream(in));
- return result;
- }
-
- public static AdaptiveLogisticModelParameters loadFromFile(File in) throws IOException {
- try (InputStream input = new FileInputStream(in)) {
- return loadFromStream(input);
- }
- }
-
- public int getInterval() {
- return interval;
- }
-
- public void setInterval(int interval) {
- this.interval = interval;
- }
-
- public int getAverageWindow() {
- return averageWindow;
- }
-
- public void setAverageWindow(int averageWindow) {
- this.averageWindow = averageWindow;
- }
-
- public int getThreads() {
- return threads;
- }
-
- public void setThreads(int threads) {
- this.threads = threads;
- }
-
- public String getPrior() {
- return prior;
- }
-
- public void setPrior(String prior) {
- this.prior = prior;
- }
-
- public String getAuc() {
- return auc;
- }
-
- public void setAuc(String auc) {
- this.auc = auc;
- }
-
- public double getPriorOption() {
- return priorOption;
- }
-
- public void setPriorOption(double priorOption) {
- this.priorOption = priorOption;
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
deleted file mode 100644
index e762924..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Preconditions;
-import com.google.common.io.Closeables;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.hadoop.io.Writable;
-
-/**
- * Encapsulates everything we need to know about a model and how it reads and vectorizes its input.
- * This encapsulation allows us to coherently save and restore a model from a file. This also
- * allows us to keep command line arguments that affect learning in a coherent way.
- */
-public class LogisticModelParameters implements Writable {
- private String targetVariable;
- private Map<String, String> typeMap;
- private int numFeatures;
- private boolean useBias;
- private int maxTargetCategories;
- private List<String> targetCategories;
- private double lambda;
- private double learningRate;
- private CsvRecordFactory csv;
- private OnlineLogisticRegression lr;
-
- /**
- * Returns a CsvRecordFactory compatible with this logistic model. The reason that this is tied
- * in here is so that we have access to the list of target categories when it comes time to save
- * the model. If the input isn't CSV, then calling setTargetCategories before calling saveTo will
- * suffice.
- *
- * @return The CsvRecordFactory.
- */
- public CsvRecordFactory getCsvRecordFactory() {
- if (csv == null) {
- csv = new CsvRecordFactory(getTargetVariable(), getTypeMap())
- .maxTargetValue(getMaxTargetCategories())
- .includeBiasTerm(useBias());
- if (targetCategories != null) {
- csv.defineTargetCategories(targetCategories);
- }
- }
- return csv;
- }
-
- /**
- * Creates a logistic regression trainer using the parameters collected here.
- *
- * @return The newly allocated OnlineLogisticRegression object
- */
- public OnlineLogisticRegression createRegression() {
- if (lr == null) {
- lr = new OnlineLogisticRegression(getMaxTargetCategories(), getNumFeatures(), new L1())
- .lambda(getLambda())
- .learningRate(getLearningRate())
- .alpha(1 - 1.0e-3);
- }
- return lr;
- }
-
- /**
- * Saves a model to an output stream.
- */
- public void saveTo(OutputStream out) throws IOException {
- Closeables.close(lr, false);
- targetCategories = getCsvRecordFactory().getTargetCategories();
- write(new DataOutputStream(out));
- }
-
- /**
- * Reads a model from a stream.
- */
- public static LogisticModelParameters loadFrom(InputStream in) throws IOException {
- LogisticModelParameters result = new LogisticModelParameters();
- result.readFields(new DataInputStream(in));
- return result;
- }
-
- /**
- * Reads a model from a file.
- * @throws IOException If there is an error opening or closing the file.
- */
- public static LogisticModelParameters loadFrom(File in) throws IOException {
- try (InputStream input = new FileInputStream(in)) {
- return loadFrom(input);
- }
- }
-
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(targetVariable);
- out.writeInt(typeMap.size());
- for (Map.Entry<String,String> entry : typeMap.entrySet()) {
- out.writeUTF(entry.getKey());
- out.writeUTF(entry.getValue());
- }
- out.writeInt(numFeatures);
- out.writeBoolean(useBias);
- out.writeInt(maxTargetCategories);
-
- if (targetCategories == null) {
- out.writeInt(0);
- } else {
- out.writeInt(targetCategories.size());
- for (String category : targetCategories) {
- out.writeUTF(category);
- }
- }
- out.writeDouble(lambda);
- out.writeDouble(learningRate);
- // skip csv
- lr.write(out);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- targetVariable = in.readUTF();
- int typeMapSize = in.readInt();
- typeMap = new HashMap<>(typeMapSize);
- for (int i = 0; i < typeMapSize; i++) {
- String key = in.readUTF();
- String value = in.readUTF();
- typeMap.put(key, value);
- }
- numFeatures = in.readInt();
- useBias = in.readBoolean();
- maxTargetCategories = in.readInt();
- int targetCategoriesSize = in.readInt();
- targetCategories = new ArrayList<>(targetCategoriesSize);
- for (int i = 0; i < targetCategoriesSize; i++) {
- targetCategories.add(in.readUTF());
- }
- lambda = in.readDouble();
- learningRate = in.readDouble();
- csv = null;
- lr = new OnlineLogisticRegression();
- lr.readFields(in);
- }
-
- /**
- * Sets the types of the predictors. This will later be used when reading CSV data. If you don't
- * use the CSV data and convert to vectors on your own, you don't need to call this.
- *
- * @param predictorList The list of variable names.
- * @param typeList The list of types in the format preferred by CsvRecordFactory.
- */
- public void setTypeMap(Iterable<String> predictorList, List<String> typeList) {
- Preconditions.checkArgument(!typeList.isEmpty(), "Must have at least one type specifier");
- typeMap = new HashMap<>();
- Iterator<String> iTypes = typeList.iterator();
- String lastType = null;
- for (Object x : predictorList) {
- // type list can be short .. we just repeat last spec
- if (iTypes.hasNext()) {
- lastType = iTypes.next();
- }
- typeMap.put(x.toString(), lastType);
- }
- }
-
- /**
- * Sets the target variable. If you don't use the CSV record factory, then this is irrelevant.
- *
- * @param targetVariable The name of the target variable.
- */
- public void setTargetVariable(String targetVariable) {
- this.targetVariable = targetVariable;
- }
-
- /**
- * Sets the number of target categories to be considered.
- *
- * @param maxTargetCategories The number of target categories.
- */
- public void setMaxTargetCategories(int maxTargetCategories) {
- this.maxTargetCategories = maxTargetCategories;
- }
-
- public void setNumFeatures(int numFeatures) {
- this.numFeatures = numFeatures;
- }
-
- public void setTargetCategories(List<String> targetCategories) {
- this.targetCategories = targetCategories;
- maxTargetCategories = targetCategories.size();
- }
-
- public List<String> getTargetCategories() {
- return this.targetCategories;
- }
-
- public void setUseBias(boolean useBias) {
- this.useBias = useBias;
- }
-
- public boolean useBias() {
- return useBias;
- }
-
- public String getTargetVariable() {
- return targetVariable;
- }
-
- public Map<String, String> getTypeMap() {
- return typeMap;
- }
-
- public void setTypeMap(Map<String, String> map) {
- this.typeMap = map;
- }
-
- public int getNumFeatures() {
- return numFeatures;
- }
-
- public int getMaxTargetCategories() {
- return maxTargetCategories;
- }
-
- public double getLambda() {
- return lambda;
- }
-
- public void setLambda(double lambda) {
- this.lambda = lambda;
- }
-
- public double getLearningRate() {
- return learningRate;
- }
-
- public void setLearningRate(double learningRate) {
- this.learningRate = learningRate;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
deleted file mode 100644
index 3ec6a06..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Preconditions;
-
-import java.io.BufferedReader;
-
-/**
- * Uses the same logic as TrainLogistic and RunLogistic for finding an input, but instead
- * of processing the input, this class just prints the input to standard out.
- */
-public final class PrintResourceOrFile {
-
- private PrintResourceOrFile() {
- }
-
- public static void main(String[] args) throws Exception {
- Preconditions.checkArgument(args.length == 1, "Must have a single argument that names a file or resource.");
- try (BufferedReader in = TrainLogistic.open(args[0])){
- String line;
- while ((line = in.readLine()) != null) {
- System.out.println(line);
- }
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
deleted file mode 100644
index 678a8f5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.HashMap;
-import java.util.Map;
-
-public final class RunAdaptiveLogistic {
-
- private static String inputFile;
- private static String modelFile;
- private static String outputFile;
- private static String idColumn;
- private static boolean maxScoreOnly;
-
- private RunAdaptiveLogistic() {
- }
-
- public static void main(String[] args) throws Exception {
- mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
-
- static void mainToOutput(String[] args, PrintWriter output) throws Exception {
- if (!parseArgs(args)) {
- return;
- }
- AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
- .loadFromFile(new File(modelFile));
-
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- csv.setIdName(idColumn);
-
- AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
-
- State<Wrapper, CrossFoldLearner> best = lr.getBest();
- if (best == null) {
- output.println("AdaptiveLogisticRegression has not be trained probably.");
- return;
- }
- CrossFoldLearner learner = best.getPayload().getLearner();
-
- BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
- int k = 0;
-
- try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile),
- Charsets.UTF_8))) {
- out.write(idColumn + ",target,score");
- out.newLine();
-
- String line = in.readLine();
- csv.firstLine(line);
- line = in.readLine();
- Map<String, Double> results = new HashMap<>();
- while (line != null) {
- Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
- csv.processLine(line, v, false);
- Vector scores = learner.classifyFull(v);
- results.clear();
- if (maxScoreOnly) {
- results.put(csv.getTargetLabel(scores.maxValueIndex()),
- scores.maxValue());
- } else {
- for (int i = 0; i < scores.size(); i++) {
- results.put(csv.getTargetLabel(i), scores.get(i));
- }
- }
-
- for (Map.Entry<String, Double> entry : results.entrySet()) {
- out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
- out.newLine();
- }
- k++;
- if (k % 100 == 0) {
- output.println(k + " records processed");
- }
- line = in.readLine();
- }
- out.flush();
- }
- output.println(k + " records processed totally.");
- }
-
- private static boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help")
- .withDescription("print this list").create();
-
- Option quiet = builder.withLongName("quiet")
- .withDescription("be extra quiet").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder
- .withLongName("input")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("input").withMaximum(1)
- .create())
- .withDescription("where to get training data").create();
-
- Option modelFileOption = builder
- .withLongName("model")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("model").withMaximum(1)
- .create())
- .withDescription("where to get the trained model").create();
-
- Option outputFileOption = builder
- .withLongName("output")
- .withRequired(true)
- .withDescription("the file path to output scores")
- .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
- .create();
-
- Option idColumnOption = builder
- .withLongName("idcolumn")
- .withRequired(true)
- .withDescription("the name of the id column for each record")
- .withArgument(argumentBuilder.withName("idcolumn").withMaximum(1).create())
- .create();
-
- Option maxScoreOnlyOption = builder
- .withLongName("maxscoreonly")
- .withDescription("only output the target label with max scores")
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help).withOption(quiet)
- .withOption(inputFileOption).withOption(modelFileOption)
- .withOption(outputFileOption).withOption(idColumnOption)
- .withOption(maxScoreOnlyOption)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- inputFile = getStringArgument(cmdLine, inputFileOption);
- modelFile = getStringArgument(cmdLine, modelFileOption);
- outputFile = getStringArgument(cmdLine, outputFileOption);
- idColumn = getStringArgument(cmdLine, idColumnOption);
- maxScoreOnly = getBooleanArgument(cmdLine, maxScoreOnlyOption);
- return true;
- }
-
- private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
- return cmdLine.hasOption(option);
- }
-
- private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
- return (String) cmdLine.getValue(inputFile);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
deleted file mode 100644
index 2d57016..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.Locale;
-
-public final class RunLogistic {
-
- private static String inputFile;
- private static String modelFile;
- private static boolean showAuc;
- private static boolean showScores;
- private static boolean showConfusion;
-
- private RunLogistic() {
- }
-
- public static void main(String[] args) throws Exception {
- mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
-
- static void mainToOutput(String[] args, PrintWriter output) throws Exception {
- if (parseArgs(args)) {
- if (!showAuc && !showConfusion && !showScores) {
- showAuc = true;
- showConfusion = true;
- }
-
- Auc collector = new Auc();
- LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));
-
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- OnlineLogisticRegression lr = lmp.createRegression();
- BufferedReader in = TrainLogistic.open(inputFile);
- String line = in.readLine();
- csv.firstLine(line);
- line = in.readLine();
- if (showScores) {
- output.println("\"target\",\"model-output\",\"log-likelihood\"");
- }
- while (line != null) {
- Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
- int target = csv.processLine(line, v);
-
- double score = lr.classifyScalar(v);
- if (showScores) {
- output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
- }
- collector.add(target, score);
- line = in.readLine();
- }
-
- if (showAuc) {
- output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc());
- }
- if (showConfusion) {
- Matrix m = collector.confusion();
- output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",
- m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
- m = collector.entropy();
- output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",
- m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
- }
- }
- }
-
- private static boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help").withDescription("print this list").create();
-
- Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
-
- Option auc = builder.withLongName("auc").withDescription("print AUC").create();
- Option confusion = builder.withLongName("confusion").withDescription("print confusion matrix").create();
-
- Option scores = builder.withLongName("scores").withDescription("print scores").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder.withLongName("input")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
- .withDescription("where to get training data")
- .create();
-
- Option modelFileOption = builder.withLongName("model")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
- .withDescription("where to get a model")
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help)
- .withOption(quiet)
- .withOption(auc)
- .withOption(scores)
- .withOption(confusion)
- .withOption(inputFileOption)
- .withOption(modelFileOption)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- inputFile = getStringArgument(cmdLine, inputFileOption);
- modelFile = getStringArgument(cmdLine, modelFileOption);
- showAuc = getBooleanArgument(cmdLine, auc);
- showScores = getBooleanArgument(cmdLine, scores);
- showConfusion = getBooleanArgument(cmdLine, confusion);
-
- return true;
- }
-
- private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
- return cmdLine.hasOption(option);
- }
-
- private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
- return (String) cmdLine.getValue(inputFile);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
deleted file mode 100644
index c657803..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.Multiset;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.function.DoubleFunction;
-import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.TreeMap;
-
-public final class SGDHelper {
-
- private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};
-
- private SGDHelper() {
- }
-
- public static void dissect(int leakType,
- Dictionary dictionary,
- AdaptiveLogisticRegression learningAlgorithm,
- Iterable<File> files, Multiset<String> overallCounts) throws IOException {
- CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
- model.close();
-
- Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
- ModelDissector md = new ModelDissector();
-
- NewsgroupHelper helper = new NewsgroupHelper();
- helper.getEncoder().setTraceDictionary(traceDictionary);
- helper.getBias().setTraceDictionary(traceDictionary);
-
- for (File file : permute(files, helper.getRandom()).subList(0, 500)) {
- String ng = file.getParentFile().getName();
- int actual = dictionary.intern(ng);
-
- traceDictionary.clear();
- Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
- md.update(v, traceDictionary, model);
- }
-
- List<String> ngNames = new ArrayList<>(dictionary.values());
- List<ModelDissector.Weight> weights = md.summary(100);
- System.out.println("============");
- System.out.println("Model Dissection");
- for (ModelDissector.Weight w : weights) {
- System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s%n",
- w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
- w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
- }
- }
-
- public static List<File> permute(Iterable<File> files, Random rand) {
- List<File> r = new ArrayList<>();
- for (File file : files) {
- int i = rand.nextInt(r.size() + 1);
- if (i == r.size()) {
- r.add(file);
- } else {
- r.add(r.get(i));
- r.set(i, file);
- }
- }
- return r;
- }
-
- static void analyzeState(SGDInfo info, int leakType, int k, State<AdaptiveLogisticRegression.Wrapper,
- CrossFoldLearner> best) throws IOException {
- int bump = info.getBumps()[(int) Math.floor(info.getStep()) % info.getBumps().length];
- int scale = (int) Math.pow(10, Math.floor(info.getStep() / info.getBumps().length));
- double maxBeta;
- double nonZeros;
- double positive;
- double norm;
-
- double lambda = 0;
- double mu = 0;
-
- if (best != null) {
- CrossFoldLearner state = best.getPayload().getLearner();
- info.setAverageCorrect(state.percentCorrect());
- info.setAverageLL(state.logLikelihood());
-
- OnlineLogisticRegression model = state.getModels().get(0);
- // finish off pending regularization
- model.close();
-
- Matrix beta = model.getBeta();
- maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
- nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
- @Override
- public double apply(double v) {
- return Math.abs(v) > 1.0e-6 ? 1 : 0;
- }
- });
- positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
- @Override
- public double apply(double v) {
- return v > 0 ? 1 : 0;
- }
- });
- norm = beta.aggregate(Functions.PLUS, Functions.ABS);
-
- lambda = best.getMappedParams()[0];
- mu = best.getMappedParams()[1];
- } else {
- maxBeta = 0;
- nonZeros = 0;
- positive = 0;
- norm = 0;
- }
- if (k % (bump * scale) == 0) {
- if (best != null) {
- File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group-" + k + ".model");
- ModelSerializer.writeBinary(modelFile.getAbsolutePath(), best.getPayload().getLearner().getModels().get(0));
- }
-
- info.setStep(info.getStep() + 0.25);
- System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
- System.out.printf("%d\t%.3f\t%.2f\t%s%n",
- k, info.getAverageLL(), info.getAverageCorrect() * 100, LEAK_LABELS[leakType % 3]);
- }
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
deleted file mode 100644
index be55d43..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-final class SGDInfo {
-
- private double averageLL;
- private double averageCorrect;
- private double step;
- private int[] bumps = {1, 2, 5};
-
- double getAverageLL() {
- return averageLL;
- }
-
- void setAverageLL(double averageLL) {
- this.averageLL = averageLL;
- }
-
- double getAverageCorrect() {
- return averageCorrect;
- }
-
- void setAverageCorrect(double averageCorrect) {
- this.averageCorrect = averageCorrect;
- }
-
- double getStep() {
- return step;
- }
-
- void setStep(double step) {
- this.step = step;
- }
-
- int[] getBumps() {
- return bumps;
- }
-
- void setBumps(int[] bumps) {
- this.bumps = bumps;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
deleted file mode 100644
index b3da452..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import com.google.common.io.Files;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.list.IntArrayList;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.Closeable;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-/**
- * Shows how different encoding choices can make big speed differences.
- * <p/>
- * Run with command line options --generate 1000000 test.csv to generate a million data lines in
- * test.csv.
- * <p/>
- * Run with command line options --parser test.csv to time how long it takes to parse and encode
- * those million data points
- * <p/>
- * Run with command line options --fast test.csv to time how long it takes to parse and encode those
- * million data points using byte-level parsing and direct value encoding.
- * <p/>
- * This doesn't demonstrate text encoding which is subject to somewhat different tricks. The basic
- * idea of caching hash locations and byte level parsing still very much applies to text, however.
- */
-public final class SimpleCsvExamples {
-
- public static final char SEPARATOR_CHAR = '\t';
- private static final int FIELDS = 100;
-
- private static final Logger log = LoggerFactory.getLogger(SimpleCsvExamples.class);
-
- private SimpleCsvExamples() {}
-
- public static void main(String[] args) throws IOException {
- FeatureVectorEncoder[] encoder = new FeatureVectorEncoder[FIELDS];
- for (int i = 0; i < FIELDS; i++) {
- encoder[i] = new ConstantValueEncoder("v" + 1);
- }
-
- OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
- for (int i = 0; i < FIELDS; i++) {
- s[i] = new OnlineSummarizer();
- }
- long t0 = System.currentTimeMillis();
- Vector v = new DenseVector(1000);
- if ("--generate".equals(args[0])) {
- try (PrintWriter out =
- new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(args[2])), Charsets.UTF_8))) {
- int n = Integer.parseInt(args[1]);
- for (int i = 0; i < n; i++) {
- Line x = Line.generate();
- out.println(x);
- }
- }
- } else if ("--parse".equals(args[0])) {
- try (BufferedReader in = Files.newReader(new File(args[1]), Charsets.UTF_8)){
- String line = in.readLine();
- while (line != null) {
- v.assign(0);
- Line x = new Line(line);
- for (int i = 0; i < FIELDS; i++) {
- s[i].add(x.getDouble(i));
- encoder[i].addToVector(x.get(i), v);
- }
- line = in.readLine();
- }
- }
- String separator = "";
- for (int i = 0; i < FIELDS; i++) {
- System.out.printf("%s%.3f", separator, s[i].getMean());
- separator = ",";
- }
- } else if ("--fast".equals(args[0])) {
- try (FastLineReader in = new FastLineReader(new FileInputStream(args[1]))){
- FastLine line = in.read();
- while (line != null) {
- v.assign(0);
- for (int i = 0; i < FIELDS; i++) {
- double z = line.getDouble(i);
- s[i].add(z);
- encoder[i].addToVector((byte[]) null, z, v);
- }
- line = in.read();
- }
- }
-
- String separator = "";
- for (int i = 0; i < FIELDS; i++) {
- System.out.printf("%s%.3f", separator, s[i].getMean());
- separator = ",";
- }
- }
- System.out.printf("\nElapsed time = %.3f%n", (System.currentTimeMillis() - t0) / 1000.0);
- }
-
-
- private static final class Line {
- private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
- public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
-
- public static final Random RAND = RandomUtils.getRandom();
-
- private final List<String> data;
-
- private Line(CharSequence line) {
- data = Lists.newArrayList(ON_TABS.split(line));
- }
-
- private Line() {
- data = new ArrayList<>();
- }
-
- public double getDouble(int field) {
- return Double.parseDouble(data.get(field));
- }
-
- /**
- * Generate a random line with 20 fields each with integer values.
- *
- * @return A new line with data.
- */
- public static Line generate() {
- Line r = new Line();
- for (int i = 0; i < FIELDS; i++) {
- double mean = ((i + 1) * 257) % 50 + 1;
- r.data.add(Integer.toString(randomValue(mean)));
- }
- return r;
- }
-
- /**
- * Returns a random exponentially distributed integer with a particular mean value. This is
- * just a way to create more small numbers than big numbers.
- *
- * @param mean mean of the distribution
- * @return random exponentially distributed integer with the specific mean
- */
- private static int randomValue(double mean) {
- return (int) (-mean * Math.log1p(-RAND.nextDouble()));
- }
-
- @Override
- public String toString() {
- return WITH_COMMAS.join(data);
- }
-
- public String get(int field) {
- return data.get(field);
- }
- }
-
- private static final class FastLine {
-
- private final ByteBuffer base;
- private final IntArrayList start = new IntArrayList();
- private final IntArrayList length = new IntArrayList();
-
- private FastLine(ByteBuffer base) {
- this.base = base;
- }
-
- public static FastLine read(ByteBuffer buf) {
- FastLine r = new FastLine(buf);
- r.start.add(buf.position());
- int offset = buf.position();
- while (offset < buf.limit()) {
- int ch = buf.get();
- offset = buf.position();
- switch (ch) {
- case '\n':
- r.length.add(offset - r.start.get(r.length.size()) - 1);
- return r;
- case SEPARATOR_CHAR:
- r.length.add(offset - r.start.get(r.length.size()) - 1);
- r.start.add(offset);
- break;
- default:
- // nothing to do for now
- }
- }
- throw new IllegalArgumentException("Not enough bytes in buffer");
- }
-
- public double getDouble(int field) {
- int offset = start.get(field);
- int size = length.get(field);
- switch (size) {
- case 1:
- return base.get(offset) - '0';
- case 2:
- return (base.get(offset) - '0') * 10 + base.get(offset + 1) - '0';
- default:
- double r = 0;
- for (int i = 0; i < size; i++) {
- r = 10 * r + base.get(offset + i) - '0';
- }
- return r;
- }
- }
- }
-
- private static final class FastLineReader implements Closeable {
- private final InputStream in;
- private final ByteBuffer buf = ByteBuffer.allocate(100000);
-
- private FastLineReader(InputStream in) throws IOException {
- this.in = in;
- buf.limit(0);
- fillBuffer();
- }
-
- public FastLine read() throws IOException {
- fillBuffer();
- if (buf.remaining() > 0) {
- return FastLine.read(buf);
- } else {
- return null;
- }
- }
-
- private void fillBuffer() throws IOException {
- if (buf.remaining() < 10000) {
- buf.compact();
- int n = in.read(buf.array(), buf.position(), buf.remaining());
- if (n == -1) {
- buf.flip();
- } else {
- buf.limit(buf.position() + n);
- buf.position(0);
- }
- }
- }
-
- @Override
- public void close() {
- try {
- Closeables.close(in, true);
- } catch (IOException e) {
- log.error(e.getMessage(), e);
- }
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
deleted file mode 100644
index 074f774..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
+++ /dev/null
@@ -1,152 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.classifier.ClassifierResult;
-import org.apache.mahout.classifier.ResultAnalyzer;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-
-/**
- * Run the ASF email, as trained by TrainASFEmail
- */
-public final class TestASFEmail {
-
- private String inputFile;
- private String modelFile;
-
- private TestASFEmail() {}
-
- public static void main(String[] args) throws IOException {
- TestASFEmail runner = new TestASFEmail();
- if (runner.parseArgs(args)) {
- runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
- }
-
- public void run(PrintWriter output) throws IOException {
-
- File base = new File(inputFile);
- //contains the best model
- OnlineLogisticRegression classifier =
- ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
-
-
- Dictionary asfDictionary = new Dictionary();
- Configuration conf = new Configuration();
- PathFilter testFilter = new PathFilter() {
- @Override
- public boolean accept(Path path) {
- return path.getName().contains("test");
- }
- };
- SequenceFileDirIterator<Text, VectorWritable> iter =
- new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
- null, true, conf);
-
- long numItems = 0;
- while (iter.hasNext()) {
- Pair<Text, VectorWritable> next = iter.next();
- asfDictionary.intern(next.getFirst().toString());
- numItems++;
- }
-
- System.out.println(numItems + " test files");
- ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
- iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
- null, true, conf);
- while (iter.hasNext()) {
- Pair<Text, VectorWritable> next = iter.next();
- String ng = next.getFirst().toString();
-
- int actual = asfDictionary.intern(ng);
- Vector result = classifier.classifyFull(next.getSecond().get());
- int cat = result.maxValueIndex();
- double score = result.maxValue();
- double ll = classifier.logLikelihood(actual, next.getSecond().get());
- ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
- ra.addInstance(asfDictionary.values().get(actual), cr);
-
- }
- output.println(ra);
- }
-
- boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help").withDescription("print this list").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder.withLongName("input")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
- .withDescription("where to get training data")
- .create();
-
- Option modelFileOption = builder.withLongName("model")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
- .withDescription("where to get a model")
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help)
- .withOption(inputFileOption)
- .withOption(modelFileOption)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- inputFile = (String) cmdLine.getValue(inputFileOption);
- modelFile = (String) cmdLine.getValue(modelFileOption);
- return true;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
deleted file mode 100644
index f0316e9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.ClassifierResult;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.classifier.ResultAnalyzer;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * Run the 20 news groups test data through SGD, as trained by {@link org.apache.mahout.classifier.sgd.TrainNewsGroups}.
- */
-public final class TestNewsGroups {
-
- private String inputFile;
- private String modelFile;
-
- private TestNewsGroups() {
- }
-
- public static void main(String[] args) throws IOException {
- TestNewsGroups runner = new TestNewsGroups();
- if (runner.parseArgs(args)) {
- runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
- }
-
- public void run(PrintWriter output) throws IOException {
-
- File base = new File(inputFile);
- //contains the best model
- OnlineLogisticRegression classifier =
- ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
-
- Dictionary newsGroups = new Dictionary();
- Multiset<String> overallCounts = HashMultiset.create();
-
- List<File> files = new ArrayList<>();
- for (File newsgroup : base.listFiles()) {
- if (newsgroup.isDirectory()) {
- newsGroups.intern(newsgroup.getName());
- files.addAll(Arrays.asList(newsgroup.listFiles()));
- }
- }
- System.out.println(files.size() + " test files");
- ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
- for (File file : files) {
- String ng = file.getParentFile().getName();
-
- int actual = newsGroups.intern(ng);
- NewsgroupHelper helper = new NewsgroupHelper();
- //no leak type ensures this is a normal vector
- Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
- Vector result = classifier.classifyFull(input);
- int cat = result.maxValueIndex();
- double score = result.maxValue();
- double ll = classifier.logLikelihood(actual, input);
- ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
- ra.addInstance(newsGroups.values().get(actual), cr);
-
- }
- output.println(ra);
- }
-
- boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help").withDescription("print this list").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder.withLongName("input")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
- .withDescription("where to get training data")
- .create();
-
- Option modelFileOption = builder.withLongName("model")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
- .withDescription("where to get a model")
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help)
- .withOption(inputFileOption)
- .withOption(modelFileOption)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- inputFile = (String) cmdLine.getValue(inputFileOption);
- modelFile = (String) cmdLine.getValue(modelFileOption);
- return true;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
deleted file mode 100644
index e681f92..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-public final class TrainASFEmail extends AbstractJob {
-
- private TrainASFEmail() {
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption("categories", "nc", "The number of categories to train on", true);
- addOption("cardinality", "c", "The size of the vectors to use", "100000");
- addOption("threads", "t", "The number of threads to use in the learner", "20");
- addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
- + "Higher values require more memory.", "5");
- if (parseArguments(args) == null) {
- return -1;
- }
-
- File base = new File(getInputPath().toString());
-
- Multiset<String> overallCounts = HashMultiset.create();
- File output = new File(getOutputPath().toString());
- output.mkdirs();
- int numCats = Integer.parseInt(getOption("categories"));
- int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
- int threadCount = Integer.parseInt(getOption("threads", "20"));
- int poolSize = Integer.parseInt(getOption("poolSize", "5"));
- Dictionary asfDictionary = new Dictionary();
- AdaptiveLogisticRegression learningAlgorithm =
- new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
- learningAlgorithm.setInterval(800);
- learningAlgorithm.setAveragingWindow(500);
-
- //We ran seq2encoded and split input already, so let's just build up the dictionary
- Configuration conf = new Configuration();
- PathFilter trainFilter = new PathFilter() {
- @Override
- public boolean accept(Path path) {
- return path.getName().contains("training");
- }
- };
- SequenceFileDirIterator<Text, VectorWritable> iter =
- new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
- long numItems = 0;
- while (iter.hasNext()) {
- Pair<Text, VectorWritable> next = iter.next();
- asfDictionary.intern(next.getFirst().toString());
- numItems++;
- }
-
- System.out.println(numItems + " training files");
-
- SGDInfo info = new SGDInfo();
-
- iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter,
- null, true, conf);
- int k = 0;
- while (iter.hasNext()) {
- Pair<Text, VectorWritable> next = iter.next();
- String ng = next.getFirst().toString();
- int actual = asfDictionary.intern(ng);
- //we already have encoded
- learningAlgorithm.train(actual, next.getSecond().get());
- k++;
- State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
-
- SGDHelper.analyzeState(info, 0, k, best);
- }
- learningAlgorithm.close();
- //TODO: how to dissection since we aren't processing the files here
- //SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
- System.out.println("exiting main, writing model to " + output);
-
- ModelSerializer.writeBinary(output + "/asf.model",
- learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
-
- List<Integer> counts = new ArrayList<>();
- System.out.println("Word counts");
- for (String count : overallCounts.elementSet()) {
- counts.add(overallCounts.count(count));
- }
- Collections.sort(counts, Ordering.natural().reverse());
- k = 0;
- for (Integer count : counts) {
- System.out.println(k + "\t" + count);
- k++;
- if (k > 1000) {
- break;
- }
- }
- return 0;
- }
-
- public static void main(String[] args) throws Exception {
- TrainASFEmail trainer = new TrainASFEmail();
- trainer.run(args);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
deleted file mode 100644
index defb5b9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.io.Resources;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-public final class TrainAdaptiveLogistic {
-
- private static String inputFile;
- private static String outputFile;
- private static AdaptiveLogisticModelParameters lmp;
- private static int passes;
- private static boolean showperf;
- private static int skipperfnum = 99;
- private static AdaptiveLogisticRegression model;
-
- private TrainAdaptiveLogistic() {
- }
-
- public static void main(String[] args) throws Exception {
- mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
-
- static void mainToOutput(String[] args, PrintWriter output) throws Exception {
- if (parseArgs(args)) {
-
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- model = lmp.createAdaptiveLogisticRegression();
- State<Wrapper, CrossFoldLearner> best;
- CrossFoldLearner learner = null;
-
- int k = 0;
- for (int pass = 0; pass < passes; pass++) {
- BufferedReader in = open(inputFile);
-
- // read variable names
- csv.firstLine(in.readLine());
-
- String line = in.readLine();
- while (line != null) {
- // for each new line, get target and predictors
- Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
- int targetValue = csv.processLine(line, input);
-
- // update model
- model.train(targetValue, input);
- k++;
-
- if (showperf && (k % (skipperfnum + 1) == 0)) {
-
- best = model.getBest();
- if (best != null) {
- learner = best.getPayload().getLearner();
- }
- if (learner != null) {
- double averageCorrect = learner.percentCorrect();
- double averageLL = learner.logLikelihood();
- output.printf("%d\t%.3f\t%.2f%n",
- k, averageLL, averageCorrect * 100);
- } else {
- output.printf(Locale.ENGLISH,
- "%10d %2d %s%n", k, targetValue,
- "AdaptiveLogisticRegression has not found a good model ......");
- }
- }
- line = in.readLine();
- }
- in.close();
- }
-
- best = model.getBest();
- if (best != null) {
- learner = best.getPayload().getLearner();
- }
- if (learner == null) {
- output.println("AdaptiveLogisticRegression has failed to train a model.");
- return;
- }
-
- try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
- lmp.saveTo(modelOutput);
- }
-
- OnlineLogisticRegression lr = learner.getModels().get(0);
- output.println(lmp.getNumFeatures());
- output.println(lmp.getTargetVariable() + " ~ ");
- String sep = "";
- for (String v : csv.getTraceDictionary().keySet()) {
- double weight = predictorWeight(lr, 0, csv, v);
- if (weight != 0) {
- output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
- sep = " + ";
- }
- }
- output.printf("%n");
-
- for (int row = 0; row < lr.getBeta().numRows(); row++) {
- for (String key : csv.getTraceDictionary().keySet()) {
- double weight = predictorWeight(lr, row, csv, key);
- if (weight != 0) {
- output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
- }
- }
- for (int column = 0; column < lr.getBeta().numCols(); column++) {
- output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
- }
- output.println();
- }
- }
-
- }
-
- private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
- double weight = 0;
- for (Integer column : csv.getTraceDictionary().get(predictor)) {
- weight += lr.getBeta().get(row, column);
- }
- return weight;
- }
-
- private static boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help")
- .withDescription("print this list").create();
-
- Option quiet = builder.withLongName("quiet")
- .withDescription("be extra quiet").create();
-
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option showperf = builder
- .withLongName("showperf")
- .withDescription("output performance measures during training")
- .create();
-
- Option inputFile = builder
- .withLongName("input")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("input").withMaximum(1)
- .create())
- .withDescription("where to get training data").create();
-
- Option outputFile = builder
- .withLongName("output")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("output").withMaximum(1)
- .create())
- .withDescription("where to write the model content").create();
-
- Option threads = builder.withLongName("threads")
- .withArgument(
- argumentBuilder.withName("threads").withDefault("4").create())
- .withDescription("the number of threads AdaptiveLogisticRegression uses")
- .create();
-
-
- Option predictors = builder.withLongName("predictors")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("predictors").create())
- .withDescription("a list of predictor variables").create();
-
- Option types = builder
- .withLongName("types")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("types").create())
- .withDescription(
- "a list of predictor variable types (numeric, word, or text)")
- .create();
-
- Option target = builder
- .withLongName("target")
- .withDescription("the name of the target variable")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("target").withMaximum(1)
- .create())
- .create();
-
- Option targetCategories = builder
- .withLongName("categories")
- .withDescription("the number of target categories to be considered")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("categories").withMaximum(1).create())
- .create();
-
-
- Option features = builder
- .withLongName("features")
- .withDescription("the number of internal hashed features to use")
- .withArgument(
- argumentBuilder.withName("numFeatures")
- .withDefault("1000").withMaximum(1).create())
- .create();
-
- Option passes = builder
- .withLongName("passes")
- .withDescription("the number of times to pass over the input data")
- .withArgument(
- argumentBuilder.withName("passes").withDefault("2")
- .withMaximum(1).create())
- .create();
-
- Option interval = builder.withLongName("interval")
- .withArgument(
- argumentBuilder.withName("interval").withDefault("500").create())
- .withDescription("the interval property of AdaptiveLogisticRegression")
- .create();
-
- Option window = builder.withLongName("window")
- .withArgument(
- argumentBuilder.withName("window").withDefault("800").create())
- .withDescription("the average propery of AdaptiveLogisticRegression")
- .create();
-
- Option skipperfnum = builder.withLongName("skipperfnum")
- .withArgument(
- argumentBuilder.withName("skipperfnum").withDefault("99").create())
- .withDescription("show performance measures every (skipperfnum + 1) rows")
- .create();
-
- Option prior = builder.withLongName("prior")
- .withArgument(
- argumentBuilder.withName("prior").withDefault("L1").create())
- .withDescription("the prior algorithm to use: L1, L2, ebp, tp, up")
- .create();
-
- Option priorOption = builder.withLongName("prioroption")
- .withArgument(
- argumentBuilder.withName("prioroption").create())
- .withDescription("constructor parameter for ElasticBandPrior and TPrior")
- .create();
-
- Option auc = builder.withLongName("auc")
- .withArgument(
- argumentBuilder.withName("auc").withDefault("global").create())
- .withDescription("the auc to use: global or grouped")
- .create();
-
-
-
- Group normalArgs = new GroupBuilder().withOption(help)
- .withOption(quiet).withOption(inputFile).withOption(outputFile)
- .withOption(target).withOption(targetCategories)
- .withOption(predictors).withOption(types).withOption(passes)
- .withOption(interval).withOption(window).withOption(threads)
- .withOption(prior).withOption(features).withOption(showperf)
- .withOption(skipperfnum).withOption(priorOption).withOption(auc)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- TrainAdaptiveLogistic.inputFile = getStringArgument(cmdLine, inputFile);
- TrainAdaptiveLogistic.outputFile = getStringArgument(cmdLine,
- outputFile);
-
- List<String> typeList = new ArrayList<>();
- for (Object x : cmdLine.getValues(types)) {
- typeList.add(x.toString());
- }
-
- List<String> predictorList = new ArrayList<>();
- for (Object x : cmdLine.getValues(predictors)) {
- predictorList.add(x.toString());
- }
-
- lmp = new AdaptiveLogisticModelParameters();
- lmp.setTargetVariable(getStringArgument(cmdLine, target));
- lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
- lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
- lmp.setInterval(getIntegerArgument(cmdLine, interval));
- lmp.setAverageWindow(getIntegerArgument(cmdLine, window));
- lmp.setThreads(getIntegerArgument(cmdLine, threads));
- lmp.setAuc(getStringArgument(cmdLine, auc));
- lmp.setPrior(getStringArgument(cmdLine, prior));
- if (cmdLine.getValue(priorOption) != null) {
- lmp.setPriorOption(getDoubleArgument(cmdLine, priorOption));
- }
- lmp.setTypeMap(predictorList, typeList);
- TrainAdaptiveLogistic.showperf = getBooleanArgument(cmdLine, showperf);
- TrainAdaptiveLogistic.skipperfnum = getIntegerArgument(cmdLine, skipperfnum);
- TrainAdaptiveLogistic.passes = getIntegerArgument(cmdLine, passes);
-
- lmp.checkParameters();
-
- return true;
- }
-
- private static String getStringArgument(CommandLine cmdLine,
- Option inputFile) {
- return (String) cmdLine.getValue(inputFile);
- }
-
- private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
- return cmdLine.hasOption(option);
- }
-
- private static int getIntegerArgument(CommandLine cmdLine, Option features) {
- return Integer.parseInt((String) cmdLine.getValue(features));
- }
-
- private static double getDoubleArgument(CommandLine cmdLine, Option op) {
- return Double.parseDouble((String) cmdLine.getValue(op));
- }
-
- public static AdaptiveLogisticRegression getModel() {
- return model;
- }
-
- public static LogisticModelParameters getParameters() {
- return lmp;
- }
-
- static BufferedReader open(String inputFile) throws IOException {
- InputStream in;
- try {
- in = Resources.getResource(inputFile).openStream();
- } catch (IllegalArgumentException e) {
- in = new FileInputStream(new File(inputFile));
- }
- return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
- }
-
-}
r***@apache.org
2018-06-28 14:55:04 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
new file mode 100644
index 0000000..752bb48
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
@@ -0,0 +1,274 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import com.google.common.io.Closeables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.math.VarIntWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
+ * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.item.RecommenderJob}.
+ * <p/>
+ * This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list
+ * (separated by the user's choosing) containing the from email and any references
+ * <p/>
+ * The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the
+ * message ids that the user has interacted with (as a VectorWritable). This class currently does not account for
+ * thread hijacking.
+ * <p/>
+ * It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
+ */
+public final class MailToPrefsDriver extends AbstractJob {
+
+ private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
+
+ private static final String OUTPUT_FILES_PATTERN = "part-*";
+ private static final int DICTIONARY_BYTE_OVERHEAD = 4;
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
+ addOption("separator", "sep", "The separator used in the input file to separate to, from, subject. Default is \\n",
+ "\n");
+ addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
+ + "zero (0).", "0");
+ addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
+ + "starting from zero (0).", "1");
+ addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
+ + "thread as an indication of their preference. Otherwise, use boolean preferences.", false, false,
+ String.valueOf(true)));
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+ int chunkSize = Integer.parseInt(getOption("chunkSize"));
+ String separator = getOption("separator");
+ Configuration conf = getConf();
+ boolean useCounts = hasOption("useCounts");
+ AtomicInteger currentPhase = new AtomicInteger();
+ int[] msgDim = new int[1];
+ //TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
+ List<Path> msgIdChunks = null;
+ boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
+ // create the dictionary between message ids and longs
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ //TODO: there seems to be a pattern emerging for dictionary creation
+ // -- sparse vectors from seq files also has this.
+ Path msgIdsPath = new Path(output, "msgIds");
+ if (overwrite) {
+ HadoopUtil.delete(conf, msgIdsPath);
+ }
+ log.info("Creating Msg Id Dictionary");
+ Job createMsgIdDictionary = prepareJob(input,
+ msgIdsPath,
+ SequenceFileInputFormat.class,
+ MsgIdToDictionaryMapper.class,
+ Text.class,
+ VarIntWritable.class,
+ MailToDictionaryReducer.class,
+ Text.class,
+ VarIntWritable.class,
+ SequenceFileOutputFormat.class);
+
+ boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ //write out the dictionary at the top level
+ msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
+ createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
+ }
+ //create the dictionary between from email addresses and longs
+ List<Path> fromChunks = null;
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Path fromIdsPath = new Path(output, "fromIds");
+ if (overwrite) {
+ HadoopUtil.delete(conf, fromIdsPath);
+ }
+ log.info("Creating From Id Dictionary");
+ Job createFromIdDictionary = prepareJob(input,
+ fromIdsPath,
+ SequenceFileInputFormat.class,
+ FromEmailToDictionaryMapper.class,
+ Text.class,
+ VarIntWritable.class,
+ MailToDictionaryReducer.class,
+ Text.class,
+ VarIntWritable.class,
+ SequenceFileOutputFormat.class);
+ createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
+ boolean succeeded = createFromIdDictionary.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ //write out the dictionary at the top level
+ int[] fromDim = new int[1];
+ fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
+ createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
+ }
+ //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
+ if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
+ //Job map
+ //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
+ // we don't need the double loop
+ log.info("Creating recommendation matrix");
+ Path vecPath = new Path(output, "recInput");
+ if (overwrite) {
+ HadoopUtil.delete(conf, vecPath);
+ }
+ //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
+ conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
+ conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
+ conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
+ conf.set(EmailUtility.FROM_INDEX, getOption("from"));
+ conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
+ conf.set(EmailUtility.SEPARATOR, separator);
+ conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
+ int j = 0;
+ int i = 0;
+ for (Path fromChunk : fromChunks) {
+ for (Path idChunk : msgIdChunks) {
+ Path out = new Path(vecPath, "tmp-" + i + '-' + j);
+ DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
+ Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
+ MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class,
+ NullWritable.class, TextOutputFormat.class);
+ createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
+ boolean succeeded = createRecMatrix.waitForCompletion(true);
+ if (!succeeded) {
+ return -1;
+ }
+ //copy the results up a level
+ //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
+ // conf, "");
+ FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null,
+ conf);
+ for (int k = 0; k < fs.length; k++) {
+ FileStatus f = fs[k];
+ Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
+ FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true,
+ overwrite, conf);
+ }
+ HadoopUtil.delete(conf, out);
+ j++;
+ }
+ i++;
+ }
+ //concat the files together
+ /*Path mergePath = new Path(output, "vectors.dat");
+ if (overwrite) {
+ HadoopUtil.delete(conf, mergePath);
+ }
+ log.info("Merging together output vectors to vectors.dat in {}", output);*/
+ //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
+ // false, conf, "\n");
+ }
+
+ return 0;
+ }
+
+ private static List<Path> createDictionaryChunks(Path inputPath,
+ Path dictionaryPathBase,
+ String name,
+ Configuration baseConf,
+ int chunkSizeInMegabytes, int[] maxTermDimension)
+ throws IOException {
+ List<Path> chunkPaths = new ArrayList<>();
+
+ Configuration conf = new Configuration(baseConf);
+
+ FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+ long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
+ int chunkIndex = 0;
+ Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
+ chunkPaths.add(chunkPath);
+
+ SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
+
+ try {
+ long currentChunkSize = 0;
+ Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
+ int i = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
+ for (Pair<Writable, Writable> record
+ : new SequenceFileDirIterable<>(filesPattern, PathType.GLOB, null, null, true, conf)) {
+ if (currentChunkSize > chunkSizeLimit) {
+ Closeables.close(dictWriter, false);
+ chunkIndex++;
+
+ chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
+ chunkPaths.add(chunkPath);
+
+ dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
+ currentChunkSize = 0;
+ }
+
+ Writable key = record.getFirst();
+ int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
+ currentChunkSize += fieldSize;
+ dictWriter.append(key, new IntWritable(i++));
+ }
+ maxTermDimension[0] = i;
+ } finally {
+ Closeables.close(dictWriter, false);
+ }
+
+ return chunkPaths;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
new file mode 100644
index 0000000..91bbd17
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {
+
+ private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
+
+ private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<>();
+ private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<>();
+ private String separator = "\n";
+ private int fromIdx;
+ private int refsIdx;
+
+ public enum Counters {
+ REFERENCE, ORIGINAL
+ }
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ Configuration conf = context.getConfiguration();
+ String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
+ String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
+ fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
+ refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
+ EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
+ log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
+ separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
+ }
+
+ @Override
+ protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+
+ int msgIdKey = Integer.MIN_VALUE;
+
+
+ int fromKey = Integer.MIN_VALUE;
+ String valStr = value.toString();
+ String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
+
+ if (splits != null && splits.length > 0) {
+ if (splits.length > refsIdx) {
+ String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
+ fromKey = fromDictionary.get(from);
+ }
+ //get the references
+ if (splits.length > refsIdx) {
+ String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
+ if (theRefs != null && theRefs.length > 0) {
+ //we have a reference, the first one is the original message id, so map to that one if it exists
+ msgIdKey = msgIdDictionary.get(theRefs[0]);
+ context.getCounter(Counters.REFERENCE).increment(1);
+ }
+ }
+ }
+ //we don't have any references, so use the msg id
+ if (msgIdKey == Integer.MIN_VALUE) {
+ //get the msg id and the from and output the associated ids
+ String keyStr = key.toString();
+ int idx = keyStr.lastIndexOf('/');
+ if (idx != -1) {
+ String msgId = keyStr.substring(idx + 1);
+ msgIdKey = msgIdDictionary.get(msgId);
+ context.getCounter(Counters.ORIGINAL).increment(1);
+ }
+ }
+
+ if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
+ context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
+ }
+ }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
new file mode 100644
index 0000000..ee36a41
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+public class MailToRecReducer extends Reducer<Text, LongWritable, Text, NullWritable> {
+ //if true, then output weight
+ private boolean useCounts = true;
+ /**
+ * We can either ignore how many times the user interacted (boolean) or output the number of times they interacted.
+ */
+ public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, true);
+ }
+
+ @Override
+ protected void reduce(Text key, Iterable<LongWritable> values, Context context)
+ throws IOException, InterruptedException {
+ if (useCounts) {
+ long sum = 0;
+ for (LongWritable value : values) {
+ sum++;
+ }
+ context.write(new Text(key.toString() + ',' + sum), null);
+ } else {
+ context.write(new Text(key.toString()), null);
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
new file mode 100644
index 0000000..f3de847
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
+ */
+public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+
+ @Override
+ protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+ //message id is in the key: /201008/AANLkTikvVnhNH+Y5AGEwqd2=***@mail.gmail.com
+ String keyStr = key.toString();
+ int idx = keyStr.lastIndexOf('@'); //find the last @
+ if (idx == -1) {
+ context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+ } else {
+ //found the @, now find the last slash before the @ and grab everything after that
+ idx = keyStr.lastIndexOf('/', idx);
+ String msgId = keyStr.substring(idx + 1);
+ if (EmailUtility.WHITESPACE.matcher(msgId).matches()) {
+ context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+ } else {
+ context.write(new Text(msgId), new VarIntWritable(1));
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
new file mode 100644
index 0000000..c358021
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+
+public final class DataFileIterable implements Iterable<Pair<PreferenceArray,long[]>> {
+
+ private final File dataFile;
+
+ public DataFileIterable(File dataFile) {
+ this.dataFile = dataFile;
+ }
+
+ @Override
+ public Iterator<Pair<PreferenceArray, long[]>> iterator() {
+ try {
+ return new DataFileIterator(dataFile);
+ } catch (IOException ioe) {
+ throw new IllegalStateException(ioe);
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
new file mode 100644
index 0000000..786e080
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.iterator.FileLineIterator;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files
+ * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds
+ * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long}
+ * array.</p>
+ *
+ * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
+ * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
+ */
+public final class DataFileIterator
+ extends AbstractIterator<Pair<PreferenceArray,long[]>>
+ implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable {
+
+ private static final Pattern COLON_PATTERN = Pattern.compile(":");
+ private static final Pattern PIPE_PATTERN = Pattern.compile("\\|");
+ private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
+ private final FileLineIterator lineIterator;
+
+ private static final Logger log = LoggerFactory.getLogger(DataFileIterator.class);
+
+ public DataFileIterator(File dataFile) throws IOException {
+ if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
+ throw new IllegalArgumentException("Bad data file: " + dataFile);
+ }
+ lineIterator = new FileLineIterator(dataFile);
+ }
+
+ @Override
+ protected Pair<PreferenceArray, long[]> computeNext() {
+
+ if (!lineIterator.hasNext()) {
+ return endOfData();
+ }
+
+ String line = lineIterator.next();
+ // First a userID|ratingsCount line
+ String[] tokens = PIPE_PATTERN.split(line);
+
+ long userID = Long.parseLong(tokens[0]);
+ int ratingsLeftToRead = Integer.parseInt(tokens[1]);
+ int ratingsRead = 0;
+
+ PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead);
+ long[] timestamps = new long[ratingsLeftToRead];
+
+ while (ratingsLeftToRead > 0) {
+
+ line = lineIterator.next();
+
+ // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data)
+ // or whether date info is included (not inluded in track 2). Item ID is always first, and date is the last
+ // two fields if it exists.
+ tokens = TAB_PATTERN.split(line);
+ boolean hasPref = tokens.length == 2 || tokens.length == 4;
+ boolean hasDate = tokens.length > 2;
+
+ long itemID = Long.parseLong(tokens[0]);
+
+ currentUserPrefs.setUserID(0, userID);
+ currentUserPrefs.setItemID(ratingsRead, itemID);
+ if (hasPref) {
+ float preference = Float.parseFloat(tokens[1]);
+ currentUserPrefs.setValue(ratingsRead, preference);
+ }
+
+ if (hasDate) {
+ long timestamp;
+ if (hasPref) {
+ timestamp = parseFakeTimestamp(tokens[2], tokens[3]);
+ } else {
+ timestamp = parseFakeTimestamp(tokens[1], tokens[2]);
+ }
+ timestamps[ratingsRead] = timestamp;
+ }
+
+ ratingsRead++;
+ ratingsLeftToRead--;
+ }
+
+ return new Pair<>(currentUserPrefs, timestamps);
+ }
+
+ @Override
+ public void skip(int n) {
+ for (int i = 0; i < n; i++) {
+ if (lineIterator.hasNext()) {
+ String line = lineIterator.next();
+ // First a userID|ratingsCount line
+ String[] tokens = PIPE_PATTERN.split(line);
+ int linesToSKip = Integer.parseInt(tokens[1]);
+ lineIterator.skip(linesToSKip);
+ } else {
+ break;
+ }
+ }
+ }
+
+ @Override
+ public void close() {
+ endOfData();
+ try {
+ Closeables.close(lineIterator, true);
+ } catch (IOException e) {
+ log.error(e.getMessage(), e);
+ }
+ }
+
+ /**
+ * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the
+ * epoch, January 1 1970.
+ * @param timeString time of day in HH:mm:ss format
+ * @return the UNIX timestamp for this moment in time
+ */
+ private static long parseFakeTimestamp(String dateString, CharSequence timeString) {
+ int days = Integer.parseInt(dateString);
+ String[] timeTokens = COLON_PATTERN.split(timeString);
+ int hours = Integer.parseInt(timeTokens[0]);
+ int minutes = Integer.parseInt(timeTokens[1]);
+ int seconds = Integer.parseInt(timeTokens[2]);
+ return 86400L * days + 3600L + hours + 60L * minutes + seconds;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
new file mode 100644
index 0000000..4b62050
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.SamplingIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>An {@link DataModel} which reads into memory any of the KDD Cup's rating files; it is really
+ * meant for use with training data in the files trainIdx{1,2}}.txt.
+ * See http://kddcup.yahoo.com/.</p>
+ *
+ * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
+ * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
+ */
+public final class KDDCupDataModel implements DataModel {
+
+ private static final Logger log = LoggerFactory.getLogger(KDDCupDataModel.class);
+
+ private final File dataFileDirectory;
+ private final DataModel delegate;
+
+ /**
+ * @param dataFile training rating file
+ */
+ public KDDCupDataModel(File dataFile) throws IOException {
+ this(dataFile, false, 1.0);
+ }
+
+ /**
+ * @param dataFile training rating file
+ * @param storeDates if true, dates are parsed and stored, otherwise not
+ * @param samplingRate percentage of users to keep; can be used to reduce memory requirements
+ */
+ public KDDCupDataModel(File dataFile, boolean storeDates, double samplingRate) throws IOException {
+
+ Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0,
+ "Must be: 0.0 < samplingRate <= 1.0");
+
+ dataFileDirectory = dataFile.getParentFile();
+
+ Iterator<Pair<PreferenceArray,long[]>> dataIterator = new DataFileIterator(dataFile);
+ if (samplingRate < 1.0) {
+ dataIterator = new SamplingIterator<>(dataIterator, samplingRate);
+ }
+
+ FastByIDMap<PreferenceArray> userData = new FastByIDMap<>();
+ FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
+
+ while (dataIterator.hasNext()) {
+
+ Pair<PreferenceArray,long[]> pair = dataIterator.next();
+ PreferenceArray userPrefs = pair.getFirst();
+ long[] timestampsForPrefs = pair.getSecond();
+
+ userData.put(userPrefs.getUserID(0), userPrefs);
+ if (storeDates) {
+ FastByIDMap<Long> itemTimestamps = new FastByIDMap<>();
+ for (int i = 0; i < timestampsForPrefs.length; i++) {
+ long timestamp = timestampsForPrefs[i];
+ if (timestamp > 0L) {
+ itemTimestamps.put(userPrefs.getItemID(i), timestamp);
+ }
+ }
+ }
+
+ }
+
+ if (storeDates) {
+ delegate = new GenericDataModel(userData, timestamps);
+ } else {
+ delegate = new GenericDataModel(userData);
+ }
+
+ Runtime runtime = Runtime.getRuntime();
+ log.info("Loaded data model in about {}MB heap", (runtime.totalMemory() - runtime.freeMemory()) / 1000000);
+ }
+
+ public File getDataFileDirectory() {
+ return dataFileDirectory;
+ }
+
+ public static File getTrainingFile(File dataFileDirectory) {
+ return getFile(dataFileDirectory, "trainIdx");
+ }
+
+ public static File getValidationFile(File dataFileDirectory) {
+ return getFile(dataFileDirectory, "validationIdx");
+ }
+
+ public static File getTestFile(File dataFileDirectory) {
+ return getFile(dataFileDirectory, "testIdx");
+ }
+
+ public static File getTrackFile(File dataFileDirectory) {
+ return getFile(dataFileDirectory, "trackData");
+ }
+
+ private static File getFile(File dataFileDirectory, String prefix) {
+ // Works on set 1 or 2
+ for (int set : new int[] {1,2}) {
+ // Works on sample data from before contest or real data
+ for (String firstLinesOrNot : new String[] {"", ".firstLines"}) {
+ for (String gzippedOrNot : new String[] {".gz", ""}) {
+ File dataFile = new File(dataFileDirectory, prefix + set + firstLinesOrNot + ".txt" + gzippedOrNot);
+ if (dataFile.exists()) {
+ return dataFile;
+ }
+ }
+ }
+ }
+ throw new IllegalArgumentException("Can't find " + prefix + " file in " + dataFileDirectory);
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() throws TasteException {
+ return delegate.getUserIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
+ return delegate.getPreferencesFromUser(userID);
+ }
+
+ @Override
+ public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
+ return delegate.getItemIDsFromUser(userID);
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() throws TasteException {
+ return delegate.getItemIDs();
+ }
+
+ @Override
+ public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
+ return delegate.getPreferencesForItem(itemID);
+ }
+
+ @Override
+ public Float getPreferenceValue(long userID, long itemID) throws TasteException {
+ return delegate.getPreferenceValue(userID, itemID);
+ }
+
+ @Override
+ public Long getPreferenceTime(long userID, long itemID) throws TasteException {
+ return delegate.getPreferenceTime(userID, itemID);
+ }
+
+ @Override
+ public int getNumItems() throws TasteException {
+ return delegate.getNumItems();
+ }
+
+ @Override
+ public int getNumUsers() throws TasteException {
+ return delegate.getNumUsers();
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
+ return delegate.getNumUsersWithPreferenceFor(itemID);
+ }
+
+ @Override
+ public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
+ return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ delegate.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ delegate.removePreference(userID, itemID);
+ }
+
+ @Override
+ public boolean hasPreferenceValues() {
+ return delegate.hasPreferenceValues();
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return 100.0f;
+ }
+
+ @Override
+ public float getMinPreference() {
+ return 0.0f;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // do nothing
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
new file mode 100644
index 0000000..3f4a732
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup;
+
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * <p>This class converts a KDD Cup input file into a compressed CSV format. The output format is
+ * {@code userID,itemID,score,timestamp}. It can optionally restrict its output to exclude
+ * score and/or timestamp.</p>
+ *
+ * <p>Run as: {@code ToCSV (input file) (output file) [num columns to output]}</p>
+ */
+public final class ToCSV {
+
+ private ToCSV() {
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ File inputFile = new File(args[0]);
+ File outputFile = new File(args[1]);
+ int columnsToOutput = 4;
+ if (args.length >= 3) {
+ columnsToOutput = Integer.parseInt(args[2]);
+ }
+
+ OutputStream outStream = new GZIPOutputStream(new FileOutputStream(outputFile));
+
+ try (Writer outWriter = new BufferedWriter(new OutputStreamWriter(outStream, Charsets.UTF_8))){
+ for (Pair<PreferenceArray,long[]> user : new DataFileIterable(inputFile)) {
+ PreferenceArray prefs = user.getFirst();
+ long[] timestamps = user.getSecond();
+ for (int i = 0; i < prefs.length(); i++) {
+ outWriter.write(String.valueOf(prefs.getUserID(i)));
+ outWriter.write(',');
+ outWriter.write(String.valueOf(prefs.getItemID(i)));
+ if (columnsToOutput > 2) {
+ outWriter.write(',');
+ outWriter.write(String.valueOf(prefs.getValue(i)));
+ }
+ if (columnsToOutput > 3) {
+ outWriter.write(',');
+ outWriter.write(String.valueOf(timestamps[i]));
+ }
+ outWriter.write('\n');
+ }
+ }
+ }
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
new file mode 100644
index 0000000..0112ab9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class EstimateConverter {
+
+ private static final Logger log = LoggerFactory.getLogger(EstimateConverter.class);
+
+ private EstimateConverter() {}
+
+ public static byte convert(double estimate, long userID, long itemID) {
+ if (Double.isNaN(estimate)) {
+ log.warn("Unable to compute estimate for user {}, item {}", userID, itemID);
+ return 0x7F;
+ } else {
+ int scaledEstimate = (int) (estimate * 2.55);
+ if (scaledEstimate > 255) {
+ scaledEstimate = 255;
+ } else if (scaledEstimate < 0) {
+ scaledEstimate = 0;
+ }
+ return (byte) scaledEstimate;
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
new file mode 100644
index 0000000..72056da
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.mahout.cf.taste.common.NoSuchItemException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+final class Track1Callable implements Callable<byte[]> {
+
+ private static final Logger log = LoggerFactory.getLogger(Track1Callable.class);
+ private static final AtomicInteger COUNT = new AtomicInteger();
+
+ private final Recommender recommender;
+ private final PreferenceArray userTest;
+
+ Track1Callable(Recommender recommender, PreferenceArray userTest) {
+ this.recommender = recommender;
+ this.userTest = userTest;
+ }
+
+ @Override
+ public byte[] call() throws TasteException {
+ long userID = userTest.get(0).getUserID();
+ byte[] result = new byte[userTest.length()];
+ for (int i = 0; i < userTest.length(); i++) {
+ long itemID = userTest.getItemID(i);
+ double estimate;
+ try {
+ estimate = recommender.estimatePreference(userID, itemID);
+ } catch (NoSuchItemException nsie) {
+ // OK in the sample data provided before the contest, should never happen otherwise
+ log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
+ continue;
+ }
+ result[i] = EstimateConverter.convert(estimate, userID, itemID);
+ }
+
+ if (COUNT.incrementAndGet() % 10000 == 0) {
+ log.info("Completed {} users", COUNT.get());
+ }
+
+ return result;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
new file mode 100644
index 0000000..067daf5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+
+public final class Track1Recommender implements Recommender {
+
+ private final Recommender recommender;
+
+ public Track1Recommender(DataModel dataModel) throws TasteException {
+ // Change this to whatever you like!
+ ItemSimilarity similarity = new UncenteredCosineSimilarity(dataModel);
+ recommender = new GenericItemBasedRecommender(dataModel, similarity);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return recommender.recommend(userID, howMany);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+ return recommend(userID, howMany, null, includeKnownItems);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return recommender.estimatePreference(userID, itemID);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ recommender.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ recommender.removePreference(userID, itemID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return recommender.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ recommender.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "Track1Recommender[recommender:" + recommender + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
new file mode 100644
index 0000000..6b9fe1b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+final class Track1RecommenderBuilder implements RecommenderBuilder {
+
+ @Override
+ public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+ return new Track1Recommender(dataModel);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
new file mode 100644
index 0000000..bcd0a3d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.eval.AbstractDifferenceRecommenderEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Attempts to run an evaluation just like that dictated for Yahoo's KDD Cup, Track 1.
+ * It will compute the RMSE of a validation data set against the predicted ratings from
+ * the training data set.
+ */
+public final class Track1RecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
+
+ private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluator.class);
+
+ private RunningAverage average;
+ private final File dataFileDirectory;
+
+ public Track1RecommenderEvaluator(File dataFileDirectory) {
+ setMaxPreference(100.0f);
+ setMinPreference(0.0f);
+ average = new FullRunningAverage();
+ this.dataFileDirectory = dataFileDirectory;
+ }
+
+ @Override
+ public double evaluate(RecommenderBuilder recommenderBuilder,
+ DataModelBuilder dataModelBuilder,
+ DataModel dataModel,
+ double trainingPercentage,
+ double evaluationPercentage) throws TasteException {
+
+ Recommender recommender = recommenderBuilder.buildRecommender(dataModel);
+
+ Collection<Callable<Void>> estimateCallables = Lists.newArrayList();
+ AtomicInteger noEstimateCounter = new AtomicInteger();
+ for (Pair<PreferenceArray,long[]> userData
+ : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
+ PreferenceArray validationPrefs = userData.getFirst();
+ long userID = validationPrefs.get(0).getUserID();
+ estimateCallables.add(
+ new PreferenceEstimateCallable(recommender, userID, validationPrefs, noEstimateCounter));
+ }
+
+ RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
+ execute(estimateCallables, noEstimateCounter, timing);
+
+ double result = computeFinalEvaluation();
+ log.info("Evaluation result: {}", result);
+ return result;
+ }
+
+ // Use RMSE scoring:
+
+ @Override
+ protected void reset() {
+ average = new FullRunningAverage();
+ }
+
+ @Override
+ protected void processOneEstimate(float estimatedPreference, Preference realPref) {
+ double diff = realPref.getValue() - estimatedPreference;
+ average.addDatum(diff * diff);
+ }
+
+ @Override
+ protected double computeFinalEvaluation() {
+ return Math.sqrt(average.getAverage());
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..deadc00
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class Track1RecommenderEvaluatorRunner {
+
+ private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluatorRunner.class);
+
+ private Track1RecommenderEvaluatorRunner() {
+ }
+
+ public static void main(String... args) throws IOException, TasteException, OptionException {
+ File dataFileDirectory = TasteOptionParser.getRatings(args);
+ if (dataFileDirectory == null) {
+ throw new IllegalArgumentException("No data directory");
+ }
+ if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+ throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+ }
+ Track1RecommenderEvaluator evaluator = new Track1RecommenderEvaluator(dataFileDirectory);
+ DataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+ double evaluation = evaluator.evaluate(new Track1RecommenderBuilder(),
+ null,
+ model,
+ Float.NaN,
+ Float.NaN);
+ log.info(String.valueOf(evaluation));
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
new file mode 100644
index 0000000..a0ff126
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1;
+
+import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
+import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.common.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+/**
+ * <p>Runs "track 1" of the KDD Cup competition using whatever recommender is inside {@link Track1Recommender}
+ * and attempts to output the result in the correct contest format.</p>
+ *
+ * <p>Run as: {@code Track1Runner [track 1 data file directory] [output file]}</p>
+ */
+public final class Track1Runner {
+
+ private static final Logger log = LoggerFactory.getLogger(Track1Runner.class);
+
+ private Track1Runner() {
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ File dataFileDirectory = new File(args[0]);
+ if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
+ throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
+ }
+
+ long start = System.currentTimeMillis();
+
+ KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
+ Track1Recommender recommender = new Track1Recommender(model);
+
+ long end = System.currentTimeMillis();
+ log.info("Loaded model in {}s", (end - start) / 1000);
+ start = end;
+
+ Collection<Track1Callable> callables = new ArrayList<>();
+ for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
+ PreferenceArray userTest = tests.getFirst();
+ callables.add(new Track1Callable(recommender, userTest));
+ }
+
+ int cores = Runtime.getRuntime().availableProcessors();
+ log.info("Running on {} cores", cores);
+ ExecutorService executor = Executors.newFixedThreadPool(cores);
+ List<Future<byte[]>> results = executor.invokeAll(callables);
+ executor.shutdown();
+
+ end = System.currentTimeMillis();
+ log.info("Ran recommendations in {}s", (end - start) / 1000);
+ start = end;
+
+ try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
+ for (Future<byte[]> result : results) {
+ for (byte estimate : result.get()) {
+ out.write(estimate);
+ }
+ }
+ }
+
+ end = System.currentTimeMillis();
+ log.info("Wrote output in {}s", (end - start) / 1000);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
new file mode 100644
index 0000000..022d78c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.model.GenericPreference;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.Preference;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * can be used to drop {@link DataModel}s into {@link ParallelArraysSGDFactorizer}
+ */
+public class DataModelFactorizablePreferences implements FactorizablePreferences {
+
+ private final FastIDSet userIDs;
+ private final FastIDSet itemIDs;
+
+ private final List<Preference> preferences;
+
+ private final float minPreference;
+ private final float maxPreference;
+
+ public DataModelFactorizablePreferences(DataModel dataModel) {
+
+ minPreference = dataModel.getMinPreference();
+ maxPreference = dataModel.getMaxPreference();
+
+ try {
+ userIDs = new FastIDSet(dataModel.getNumUsers());
+ itemIDs = new FastIDSet(dataModel.getNumItems());
+ preferences = new ArrayList<>();
+
+ LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
+ while (userIDsIterator.hasNext()) {
+ long userID = userIDsIterator.nextLong();
+ userIDs.add(userID);
+ for (Preference preference : dataModel.getPreferencesFromUser(userID)) {
+ itemIDs.add(preference.getItemID());
+ preferences.add(new GenericPreference(userID, preference.getItemID(), preference.getValue()));
+ }
+ }
+ } catch (TasteException te) {
+ throw new IllegalStateException("Unable to create factorizable preferences!", te);
+ }
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() {
+ return userIDs.iterator();
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() {
+ return itemIDs.iterator();
+ }
+
+ @Override
+ public Iterable<Preference> getPreferences() {
+ return preferences;
+ }
+
+ @Override
+ public float getMinPreference() {
+ return minPreference;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return maxPreference;
+ }
+
+ @Override
+ public int numUsers() {
+ return userIDs.size();
+ }
+
+ @Override
+ public int numItems() {
+ return itemIDs.size();
+ }
+
+ @Override
+ public int numPreferences() {
+ return preferences.size();
+ }
+}
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
new file mode 100644
index 0000000..a126dec
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.model.Preference;
+
/**
 * models the necessary input for {@link ParallelArraysSGDFactorizer}
 */
public interface FactorizablePreferences {

  /** @return iterator over all user IDs */
  LongPrimitiveIterator getUserIDs();

  /** @return iterator over all item IDs */
  LongPrimitiveIterator getItemIDs();

  /** @return all preferences; may be iterated more than once by the factorizer */
  Iterable<Preference> getPreferences();

  /** @return smallest preference value that can occur */
  float getMinPreference();

  /** @return largest preference value that can occur */
  float getMaxPreference();

  /** @return number of distinct users */
  int numUsers();

  /** @return number of distinct items */
  int numItems();

  /** @return total number of preferences */
  int numPreferences();

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
new file mode 100644
index 0000000..6dcef6b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
+
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.common.Pair;

import java.io.File;
import java.util.NoSuchElementException;
+
+public class KDDCupFactorizablePreferences implements FactorizablePreferences {
+
+ private final File dataFile;
+
+ public KDDCupFactorizablePreferences(File dataFile) {
+ this.dataFile = dataFile;
+ }
+
+ @Override
+ public LongPrimitiveIterator getUserIDs() {
+ return new FixedSizeLongIterator(numUsers());
+ }
+
+ @Override
+ public LongPrimitiveIterator getItemIDs() {
+ return new FixedSizeLongIterator(numItems());
+ }
+
+ @Override
+ public Iterable<Preference> getPreferences() {
+ Iterable<Iterable<Preference>> prefIterators =
+ Iterables.transform(new DataFileIterable(dataFile),
+ new Function<Pair<PreferenceArray,long[]>,Iterable<Preference>>() {
+ @Override
+ public Iterable<Preference> apply(Pair<PreferenceArray,long[]> from) {
+ return from.getFirst();
+ }
+ });
+ return Iterables.concat(prefIterators);
+ }
+
+ @Override
+ public float getMinPreference() {
+ return 0;
+ }
+
+ @Override
+ public float getMaxPreference() {
+ return 100;
+ }
+
+ @Override
+ public int numUsers() {
+ return 1000990;
+ }
+
+ @Override
+ public int numItems() {
+ return 624961;
+ }
+
+ @Override
+ public int numPreferences() {
+ return 252800275;
+ }
+
+ static class FixedSizeLongIterator extends AbstractLongPrimitiveIterator {
+
+ private long currentValue;
+ private final long maximum;
+
+ FixedSizeLongIterator(long maximum) {
+ this.maximum = maximum;
+ currentValue = 0;
+ }
+
+ @Override
+ public long nextLong() {
+ return currentValue++;
+ }
+
+ @Override
+ public long peek() {
+ return currentValue;
+ }
+
+ @Override
+ public void skip(int n) {
+ currentValue += n;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return currentValue < maximum;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+}
r***@apache.org
2018-06-28 14:54:56 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/pom.xml b/community/mahout-mr/mr/pom.xml
new file mode 100644
index 0000000..0f28588
--- /dev/null
+++ b/community/mahout-mr/mr/pom.xml
@@ -0,0 +1,295 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-mr</artifactId>
+ <version>0.14.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <!-- modules inherit parent's group id and version. -->
+ <artifactId>mr</artifactId>
+ <name>-- Mahout Classic: Algorithms and Code</name>
+ <description>Scalable machine learning libraries</description>
+
+ <packaging>jar</packaging>
+
+ <properties>
+ <hadoop.version>2.4.1</hadoop.version>
+ <lucene.version>5.5.2</lucene.version>
+ </properties>
+ <build>
+ <resources>
+ <resource>
+ <directory>mr/src/main/resources</directory>
+ </resource>
+ <resource>
+ <directory>../src/conf</directory>
+ <includes>
+ <include>driver.classes.default.props</include>
+ </includes>
+ </resource>
+ </resources>
+ <plugins>
+ <!-- ensure licenses -->
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ </plugin>
+
+ <!-- copy jars to lib/ -->
+ <plugin>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <version>1.4</version>
+ <executions>
+ <execution>
+ <id>copy</id>
+ <phase>package</phase>
+ <configuration>
+ <tasks>
+ <!-- NOTE(review): ${version} is a deprecated alias; prefer ${project.version}.
+      Also, this module's artifactId is "mr", so the default built jar is
+      target/mr-${project.version}.jar — "target/mahout-mr-${version}.jar"
+      looks wrong unless finalName is overridden elsewhere; verify before
+      relying on this copy step. -->
+ <copy file="target/mahout-mr-${version}.jar" tofile="../../lib/mahout-mr-${version}.jar" />
+ </tasks>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- delete files on mvn clean -->
+ <plugin>
+ <artifactId>maven-clean-plugin</artifactId>
+ <version>3.0.0</version>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>../../lib/</directory>
+ <includes>
+ <include>mahout-mr_*.jar</include>
+ </includes>
+ <followSymlinks>false</followSymlinks>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ <!-- create test jar so other modules can reuse the core test utility classes. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- create core hadoop job jar -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>job</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>src/main/assembly/job.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-remote-resources-plugin</artifactId>
+ <configuration>
+ <appendedResourcesDirectory>src/main/resources</appendedResourcesDirectory>
+ <resourceBundles>
+ <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+ </resourceBundles>
+ <supplementalModels>
+ <supplementalModel>supplemental-models.xml</supplementalModel>
+ </supplementalModels>
+ </configuration>
+ </plugin>
+
+ <!-- remove jars from top directory on clean -->
+ <!-- NOTE(review): maven-clean-plugin is already declared earlier in this same
+      <plugins> section ("delete files on mvn clean"). Maven expects a single
+      declaration per plugin and will warn about the duplicate; these two
+      entries should be merged. This entry's pattern mahout-mr*.jar also
+      subsumes the earlier mahout-mr_*.jar pattern. -->
+ <plugin>
+ <artifactId>maven-clean-plugin</artifactId>
+ <version>3.0.0</version>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>../../lib</directory>
+ <includes>
+ <include>mahout-mr*.jar</include>
+ </includes>
+ <followSymlinks>false</followSymlinks>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+
+
+
+ <!-- Third Party -->
+
+ <dependency>
+ <groupId>com.tdunning</groupId>
+ <artifactId>t-digest</artifactId>
+ <version>3.1</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>11.0.2</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>2.7.4</version>
+ </dependency>
+
+
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ <version>3.1</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.thoughtworks.xstream</groupId>
+ <artifactId>xstream</artifactId>
+ <version>1.4.4</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>${lucene.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mahout.commons</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>2.0-mahout</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-math3</artifactId>
+ <version>3.2</version>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.12</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <version>1.3</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.carrotsearch.randomizedtesting</groupId>
+ <artifactId>randomizedtesting-runner</artifactId>
+ <version>2.0.15</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.easymock</groupId>
+ <artifactId>easymock</artifactId>
+ <version>3.2</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mrunit</groupId>
+ <artifactId>mrunit</artifactId>
+ <version>1.0.0</version>
+ <classifier>hadoop2</classifier>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <version>3.0.1</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.solr</groupId>
+ <artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
+ </dependency>
+
+ </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml b/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
new file mode 100644
index 0000000..971c72b
--- /dev/null
+++ b/community/mahout-mr/mr/src/appended-resources/supplemental-models.xml
@@ -0,0 +1,279 @@
+<supplementalDataModels>
+ <!-- missing: Maven Profile Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-profile</artifactId>
+ <name>Maven Profile Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-profile/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- missing: Maven Project Builder -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-project</artifactId>
+ <name>Maven Project Builder</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-project/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- missing: Maven Local Settings -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-settings</artifactId>
+ <name>Maven Local Settings</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-settings/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Repository Metadata Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-repository-metadata</artifactId>
+ <name>Maven Repository Metadata Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.1.0/maven-repository-metadata/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Model -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-model</artifactId>
+ <name>Maven Model</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://maven.apache.org/ref/2.0.8/maven-model/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact -->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-artifact</artifactId>
+ <name>Maven Artifact</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact Manager-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-artifact-manager</artifactId>
+ <name>Maven Artifact Manager</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Artifact Manager-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>maven-plugin-api</artifactId>
+ <name>Maven Plugin API</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Maven Wagon API-->
+ <supplement>
+ <project>
+ <groupId>org.apache.maven</groupId>
+ <artifactId>wagon-provider-api</artifactId>
+ <name>Maven Wagon API</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Shade Maven Plugin -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>shade-maven-plugin</artifactId>
+ <name>Shade Maven Plugin</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- junit -->
+ <supplement>
+ <project>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <name>Junit Unit testing library</name>
+ <licenses>
+ <license>
+ <name>Common Public License - v 1.0</name>
+ <url>http://junit.sourceforge.net/cpl-v10.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- jdom -->
+ <supplement>
+ <project>
+ <groupId>jdom</groupId>
+ <artifactId>jdom</artifactId>
+ <name>JDom</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- asm -->
+ <supplement>
+ <project>
+ <groupId>asm</groupId>
+ <artifactId>asm-all</artifactId>
+ <name>ASM ALL</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>http://asm.ow2.org/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Default Plexus Container -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.plexus</groupId>
+ <artifactId>plexus-container-default</artifactId>
+ <name>Default Plexus Container</name>
+ <licenses>
+ <license>
+ <name>UNKNOWN</name>
+ <url>UNKNOWN</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Classworlds -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.classworlds</groupId>
+ <artifactId>classworlds</artifactId>
+ <name>Classworlds</name>
+ <licenses>
+ <license>
+ <name></name>
+ <url>http://classworlds.codehaus.org/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Plexus Common Utilities -->
+ <supplement>
+ <project>
+ <groupId>org.codehaus.plexus</groupId>
+ <artifactId>plexus-utils</artifactId>
+ <name>Plexus Common Utilities</name>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://plexus.codehaus.org/plexus-utils/license.html</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Commons Codec -->
+ <supplement>
+ <project>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <name>Commons Codec</name>
+ <url>http://commons.apache.org/codec/</url>
+ <organization>
+ <name>Apache Software Foundation</name>
+ <url>http://www.apache.org/</url>
+ </organization>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Commons CLI -->
+ <supplement>
+ <project>
+ <groupId>org.apache.mahout.commons</groupId>
+ <artifactId>commons-cli</artifactId>
+ <name>Commons CLI</name>
+ <url>http://commons.apache.org/cli/</url>
+ <organization>
+ <name>Apache Software Foundation</name>
+ <url>http://www.apache.org/</url>
+ </organization>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+ <!-- Xpp3 -->
+ <supplement>
+ <project>
+ <name>Xml Pull Parser 3rd Edition</name>
+ <groupId>xpp3</groupId>
+ <artifactId>xpp3_min</artifactId>
+ <url>http://www.extreme.indiana.edu/xgws/xsoap/xpp/mxp1/</url>
+ <licenses>
+ <license>
+ <name>Public Domain</name>
+ <url>http://www.xmlpull.org/</url>
+ </license>
+ </licenses>
+ </project>
+ </supplement>
+</supplementalDataModels>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg b/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg
new file mode 100644
index 0000000..ce3ea9f
--- /dev/null
+++ b/community/mahout-mr/mr/src/images/logos/ mahout-powered.svg
@@ -0,0 +1,630 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 13.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 14948) -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+ width="956px" height="400px" viewBox="0 0 956 400" enable-background="new 0 0 956 400" xml:space="preserve">
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M709.799,389.6c-21.38,0-37.761-6.839-48.688-20.322
+ c-0.377-0.467-0.747-0.936-1.11-1.408V376c0,5.523-4.478,10.001-10.001,10.001h-28.6c-5.522,0-10-4.478-10-10.001v-64.87
+ c0-4.989-0.908-7.693-1.669-9.083c-0.053-0.096-0.104-0.194-0.154-0.292c-0.32-0.634-0.987-1.954-5.366-1.954
+ c-5.29,0-7.384,1.85-8.617,3.464c-2.353,3.069-3.593,8.255-3.593,15.005V376c0,5.523-4.477,10.001-10,10.001h-27.8
+ c-0.756,0-1.492-0.085-2.201-0.244c-0.708,0.159-1.444,0.244-2.2,0.244h-30.271c-3.453,0-6.61-1.776-8.425-4.61
+ c-0.791,0.505-1.595,0.995-2.412,1.471c-7.595,4.351-16.133,6.54-25.442,6.54c-11.384,0-21.145-3.183-29.042-9.469
+ c-1.529,3.569-5.072,6.068-9.198,6.068h-28.408c-5.523,0-10-4.478-10-10.001v-67.812c0-3.194-0.564-4.789-0.9-5.458
+ c-0.392-0.777-0.97-1.93-4.821-1.93c-4.724,0-5.983,1.728-6.896,3.675c-0.919,2.062-1.383,4.791-1.383,8.114V376
+ c0,5.523-4.477,10.001-10,10.001h-27.8c-5.523,0-10-4.478-10-10.001v-63.33c0-6.95-0.88-9.239-1.055-9.628
+ c-0.349-0.762-0.843-1.841-4.675-1.841c-5.697,0-6.798,1.676-7.151,2.329c-0.298,0.621-1.12,2.837-1.12,8.449V376
+ c0,5.523-4.477,10.001-10,10.001h-28.199c-5.523,0-10-4.478-10-10.001V269.8c0-5.522,4.477-10,10-10h26.999
+ c2.902,0,5.514,1.235,7.34,3.209c6.486-3.852,14.321-5.809,23.34-5.809c10.216,0,18.796,2.437,25.504,7.242
+ c0.185,0.133,0.368,0.272,0.545,0.418c1.322,1.092,2.566,2.262,3.73,3.506c2.438-2.188,5.07-4.048,7.884-5.571
+ c0.07-0.036,0.14-0.073,0.211-0.11c7.126-3.639,15.103-5.484,23.707-5.484c5.958,0,11.882,1.164,17.608,3.456
+ c6.131,2.448,11.667,6.673,16.449,12.554c1.573,1.945,2.946,4.052,4.116,6.313c0.941-1.602,1.974-3.131,3.103-4.586
+ C462.508,263.016,477.94,257,499.041,257c13.235,0,25.249,2.715,35.706,8.067c3.12,1.598,6.458,3.872,9.454,7.101v-39.569
+ c0-5.522,4.477-10,10-10h27.8c5.523,0,10,4.478,10,10v28.484c6.504-2.974,13.447-4.483,20.639-4.483
+ c7.865,0,15.192,1.418,21.774,4.218c7.009,3,12.832,7.627,17.329,13.761c2.014,2.758,3.63,5.599,4.846,8.499
+ c1.368-2.145,2.862-4.229,4.481-6.253c10.92-13.683,27.316-20.624,48.729-20.624c21.414,0,37.812,6.941,48.737,20.633
+ c0.225,0.278,0.444,0.562,0.665,0.843v-8.274c0-5.523,4.477-10,10-10h28.6c5.523,0,10,4.477,10,10v64.358
+ c0,6.407,0.92,8.881,1.203,9.484c0.409,0.88,1.098,2.354,5.816,2.354c6.371,0,8.746-2.222,10.299-5.57
+ c0.86-2.012,1.881-5.809,1.881-12.539v-58.088c0-5.523,4.477-10,10-10h28.201c1.719,0,3.338,0.434,4.749,1.198h2.85v-20.001
+ c0-5.522,4.478-10,10.001-10h27.6c5.522,0,10,4.478,10,10V260.6h7.198c5.523,0,10,4.477,10,10v19.602c0,5.523-4.477,10-10,10H920.4
+ v46.178c0.521,0.013,1.106,0.021,1.76,0.021c0.63,0,1.279-0.023,1.929-0.071c0.704-0.053,1.405-0.129,2.085-0.227
+ c0.475-0.068,0.952-0.103,1.427-0.103c2.388,0,4.717,0.856,6.547,2.442c2.192,1.899,3.451,4.658,3.451,7.558v20.8
+ c0,5.347-4.205,9.745-9.545,9.989l-13.179,0.602c-0.037,0.002-0.076,0.004-0.113,0.004c-1.198,0.042-2.364,0.062-3.501,0.062
+ c-14.403,0-24.539-3.26-30.987-9.963c-2.15-2.205-3.846-4.837-5.072-7.872V376c0,5.523-4.478,10.001-10,10.001H838.2
+ c-3.148,0-5.959-1.456-7.791-3.732c-2.405,1.436-4.804,2.577-7.188,3.416c-5.142,1.804-11.065,2.717-17.621,2.717
+ c-24.711,0-35.835-12.303-40.818-22.626c-0.51-1.045-0.984-2.142-1.422-3.292c-1.476,2.343-3.101,4.608-4.874,6.796
+ C747.562,382.761,731.181,389.6,709.799,389.6L709.799,389.6z M487.944,348.278c0.598,0.447,1.538,0.922,3.414,0.922
+ c4.033,0,7.665-1.15,11.099-3.517c1.935-1.333,2.882-4.174,3.318-7.126c-0.231,0.043-0.465,0.089-0.702,0.133l-6.347,1.172
+ c-6.723,1.191-9.018,2.316-9.562,2.634c-0.961,0.561-1.564,1.024-1.564,3.194C487.601,347.181,487.822,347.995,487.944,348.278
+ L487.944,348.278z M709.751,299.801c-6.414,0-9.15,2.51-10.819,4.697c-3.009,3.937-4.531,10.177-4.531,18.552
+ c0,8.386,1.529,14.651,4.544,18.623c1.671,2.205,4.405,4.728,10.807,4.728c6.375,0,9.085-2.51,10.732-4.697
+ c2.995-3.98,4.517-10.259,4.517-18.653c0-8.384-1.515-14.637-4.504-18.585C718.854,302.297,716.139,299.801,709.751,299.801
+ L709.751,299.801z M491.611,300.711c-0.264,0.336-0.562,0.826-0.854,1.529l7.135-0.875c3.8-0.479,5.996-0.97,7.181-1.304
+ c-1.357-0.335-3.556-0.662-6.974-0.662C493.944,299.399,492.062,300.24,491.611,300.711L491.611,300.711z"/>
+ <path fill="#1F1F1F" d="M582,232.6v50.641c4.02-6.2,8.67-10.52,13.96-12.971c5.28-2.449,10.851-3.67,16.681-3.67
+ c6.549,0,12.5,1.141,17.859,3.42c5.35,2.291,9.74,5.78,13.18,10.471c2.91,3.99,4.7,8.08,5.35,12.289
+ c0.65,4.201,0.971,11.07,0.971,20.601V376h-28.6v-64.87c0-5.739-0.971-10.37-2.9-13.89c-2.51-4.961-7.27-7.44-14.29-7.44
+ c-7.271,0-12.79,2.46-16.56,7.39c-3.771,4.92-5.65,11.951-5.65,21.08V376h-27.8V232.6H582 M910.4,240.6v30H927.6V290.2H910.4
+ v56.409c0,4.371,0.55,7.101,1.649,8.17c1.101,1.08,4.47,1.621,10.11,1.621c0.84,0,1.73-0.03,2.67-0.101
+ c0.939-0.069,1.859-0.17,2.77-0.3v20.8l-13.18,0.601c-1.083,0.037-2.135,0.056-3.161,0.056c-11.429,0-19.356-2.299-23.778-6.896
+ c-3.121-3.201-4.681-8.121-4.681-14.761v-65.6H868V270.6h14.8v-30H910.4 M709.8,266.2c18.3,0,31.94,5.62,40.92,16.87
+ c8.99,11.24,13.48,24.539,13.48,39.88c0,15.6-4.49,28.94-13.48,40.03c-8.979,11.08-22.62,16.619-40.92,16.619
+ s-31.94-5.539-40.92-16.619c-8.989-11.09-13.479-24.431-13.479-40.03c0-15.341,4.49-28.64,13.479-39.88
+ C677.859,271.82,691.5,266.2,709.8,266.2 M709.75,356.4c8.12,0,14.359-2.891,18.72-8.68c4.351-5.781,6.53-14.011,6.53-24.671
+ c0-10.659-2.18-18.87-6.53-24.62c-4.36-5.75-10.6-8.63-18.72-8.63c-8.13,0-14.38,2.88-18.77,8.63
+ c-4.391,5.75-6.58,13.961-6.58,24.62c0,10.66,2.189,18.89,6.58,24.671C695.37,353.51,701.62,356.4,709.75,356.4 M499.04,267
+ c11.69,0,22.069,2.32,31.149,6.971c9.07,4.639,13.61,13.369,13.61,26.18v48.76c0,3.38,0.07,7.48,0.2,12.29
+ c0.2,3.63,0.75,6.09,1.67,7.39c0.92,1.301,2.29,2.37,4.13,3.21v4.2h-30.271c-0.84-2.141-1.43-4.141-1.75-6.02
+ c-0.329-1.881-0.59-4.021-0.779-6.41c-3.859,4.17-8.311,7.72-13.34,10.65c-6.02,3.449-12.82,5.18-20.41,5.18
+ c-9.68,0-17.67-2.75-23.98-8.26c-6.31-5.5-9.47-13.301-9.47-23.4c0-13.1,5.08-22.57,15.23-28.44c5.56-3.19,13.75-5.47,24.55-6.84
+ l9.529-1.17c5.17-0.649,8.871-1.47,11.101-2.44c3.99-1.699,5.99-4.34,5.99-7.92c0-4.359-1.53-7.38-4.601-9.039
+ c-3.06-1.66-7.56-2.49-13.5-2.49c-6.66,0-11.379,1.619-14.14,4.869c-1.979,2.4-3.3,5.641-3.96,9.73h-26.8
+ c0.59-9.311,3.2-16.95,7.84-22.939C468.41,271.689,481.08,267,499.04,267 M491.359,359.2c6.07,0,11.66-1.761,16.771-5.28
+ c5.12-3.529,7.771-9.949,7.97-19.279V324.26c-1.779,1.11-3.58,2.01-5.39,2.69c-1.81,0.69-4.3,1.319-7.47,1.909l-6.33,1.17
+ c-5.93,1.051-10.189,2.32-12.77,3.82c-4.361,2.551-6.541,6.49-6.541,11.84c0,4.771,1.339,8.211,4.009,10.33
+ C484.279,358.141,487.529,359.2,491.359,359.2 M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58
+ c3,3.711,5.02,8.271,6.06,13.67c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959
+ c-2.49-4.961-7.07-7.431-13.75-7.431c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33
+ c0-6.311-0.65-10.9-1.95-13.76c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07
+ V376h-28.2V269.8h27v15.46c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37
+ c4.02,3.32,7.08,7.58,9.15,12.779c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M865.2,269.4V376h-27
+ v-14.96c-0.261,0.33-0.91,1.3-1.95,2.931c-1.04,1.619-2.28,3.049-3.71,4.289c-4.36,3.9-8.57,6.561-12.64,7.99
+ c-4.07,1.43-8.83,2.15-14.301,2.15c-15.74,0-26.35-5.66-31.81-16.971c-3.06-6.27-4.59-15.5-4.59-27.699V269.4h28.6v64.359
+ c0,6.07,0.71,10.641,2.14,13.711c2.53,5.42,7.49,8.129,14.881,8.129c9.47,0,15.959-3.85,19.459-11.56
+ c1.811-4.181,2.721-9.7,2.721-16.55V269.4H865.2 M582,212.6h-27.8c-11.046,0-20,8.954-20,20v21.182
+ C523.599,249.28,511.796,247,499.04,247c-20.979,0-37.309,5.431-48.668,16.161c-5.107-5.312-10.877-9.27-17.208-11.796
+ c-6.893-2.761-14.068-4.165-21.305-4.165c-10.198,0-19.703,2.213-28.252,6.576c-0.145,0.074-0.289,0.149-0.431,0.227
+ c-0.904,0.49-1.792,1.006-2.664,1.55c-8.252-5.543-18.415-8.353-30.233-8.353c-8.355,0-15.932,1.435-22.647,4.278
+ c-2.458-1.08-5.175-1.679-8.032-1.679h-27c-11.045,0-20,8.954-20,20V376c0,11.046,8.955,20,20,20h28.2
+ c7.177,0,13.472-3.781,17-9.459c3.528,5.678,9.823,9.459,17,9.459h27.8c7.177,0,13.471-3.781,17-9.459
+ c3.528,5.678,9.823,9.459,17,9.459h28.41c3.945,0,7.625-1.143,10.724-3.115c8.044,4.328,17.258,6.516,27.516,6.516
+ c9.591,0,18.534-1.975,26.644-5.875c2.891,1.591,6.19,2.475,9.636,2.475H549.8c0.743,0,1.478-0.04,2.2-0.119
+ c0.723,0.079,1.457,0.119,2.2,0.119H582c9.862,0,18.058-7.139,19.7-16.531c1.643,9.393,9.838,16.531,19.7,16.531H650
+ c6.725,0,12.675-3.318,16.3-8.408c11.611,7.979,26.173,12.008,43.5,12.008c22.084,0,39.678-6.547,52.395-19.475
+ c7.525,9.087,20.741,18.275,43.405,18.275c7.69,0,14.732-1.104,20.93-3.281c0.97-0.341,1.939-0.72,2.908-1.136
+ c2.646,1.292,5.62,2.017,8.763,2.017h27c5.679,0,10.805-2.367,14.445-6.168c7.947,5.119,18.379,7.624,31.613,7.624
+ c1.246,0,2.539-0.022,3.843-0.067c0.076-0.003,0.152-0.006,0.229-0.009l13.18-0.601c10.681-0.486,19.09-9.287,19.09-19.979V356
+ c0-5.798-2.516-11.311-6.896-15.108c-2.94-2.551-6.527-4.16-10.304-4.694v-26.191c9.72-1.362,17.199-9.711,17.199-19.806V270.6
+ c0-10.095-7.479-18.443-17.199-19.806V240.6c0-11.046-8.954-20-20-20H882.8c-11.046,0-20,8.954-20,20v8.801H837
+ c-9.677,0-17.747,6.871-19.601,16.001c-1.852-9.13-9.923-16.001-19.6-16.001h-28.6c-6.813,0-12.833,3.408-16.443,8.612
+ c-3.523-2.381-7.322-4.414-11.38-6.087c-9.217-3.799-19.841-5.726-31.577-5.726s-22.36,1.927-31.577,5.726
+ c-7.925,3.267-14.862,7.909-20.695,13.84c-5.208-6.167-11.636-10.911-19.153-14.131c-0.016-0.007-0.031-0.014-0.047-0.021
+ c-7.824-3.327-16.467-5.015-25.687-5.015c-3.604,0-7.156,0.315-10.641,0.943V232.6C602,221.554,593.046,212.6,582,212.6L582,212.6z
+ M709.75,336.4c-2.254,0-2.562-0.406-2.833-0.764c-0.598-0.787-2.517-3.982-2.517-12.587c0-8.573,1.895-11.722,2.476-12.482
+ c0.263-0.343,0.587-0.768,2.874-0.768c2.241,0,2.542,0.396,2.783,0.715c0.569,0.752,2.467,3.929,2.467,12.535
+ c0,8.638-1.922,11.862-2.511,12.645C712.255,336.006,711.958,336.4,709.75,336.4L709.75,336.4z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,388c-14.735,0-16.195-10.601-16.492-15.157
+ c-2.281,0.968-5.548,2.49-8.354,3.8C254.849,383.076,243.715,388,236.499,388c-25.961,0-44.166-21.61-49.72-41.423
+ c-0.496,1.275-1.103,2.539-1.847,3.778l-0.259,0.435l-0.314,0.393C176.217,361.363,147.782,362,147.5,362
+ c-13.223,0-22.925-3.37-28.833-10.014c-3.174-3.572-6.704-9.898-5.668-19.864c-0.076-13.164,4.078-39.976,7.319-50.778l1.604-5.345
+ h5.58H138.5h3.11l2.2,2.203c2.876,2.883,2.6,6.301,2.397,8.795c-0.186,2.297-0.532,6.568-0.213,15.227
+ c0.099,2.286,2.6,9.209,5.635,13.571c2.905-2.996,8.481-10.19,18.777-27.414c1.035-1.731,1.508-2.521,1.855-3.041l4.312-6.47
+ c-2.459-5.737-5.025-12.35-5.561-21.953L171,256.709V256.5c0-1.624,0.272-3.165,0.536-4.656c0.063-0.36,0.141-0.801,0.208-1.223
+ c-1.643-1.128-3.838-2.151-6.127-3.218c-2.111-0.98-4.292-1.997-6.398-3.256c-0.369-0.209-0.729-0.422-1.082-0.644
+ c0.54,1.213,0.862,2.522,0.862,3.996c0,3.947-4.782,14.335-8.793,22.354l-1.476,2.949l-3.169,0.907
+ c-4.74,1.354-14.83,1.837-22.691,1.837c-3.454,0-7.977-0.087-12.869-0.412v1.364c0,1.262,0.242,3.583,0.437,5.449
+ c0.242,2.332,0.392,3.825,0.392,5.05c0,9.626-4.898,16.854-13.795,20.355c-5.908,2.325-12.401,2.646-18.535,2.646
+ c-14.368,0-22.193-2.225-27.005-7.674c-4.93-5.588-4.942-12.66-4.958-20.851c-0.002-1.472-0.006-3.027-0.036-4.666
+ c-0.021-0.987,0.051-4.085,0.19-9.928c0.137-5.841,0.308-13.109,0.308-16.382v-21.002c-4.692-11.946-6.908-23.599-7.928-30.97
+ c-1.042-7.549,0.479-14.029,4.519-19.265c2.714-3.515,6.315-6.117,10.411-8.084v-3.68c0-4.226,0-8.548,0.348-12.964
+ c-0.274-0.091-0.552-0.181-0.833-0.272c-7.121-2.319-15.983-5.204-21.708-11.882C22.598,131.542,17,104.646,17,101.5
+ c0-9.415,5.693-15.501,14.501-15.501C40.835,85.999,46,94.573,46,100.5c0,2.351-0.814,5.752-2.543,12.424
+ c-0.538,2.081-1.261,4.873-1.453,5.927c0.13,5.004,3.026,8.388,5.463,10.36c3.112,2.516,7.279,4.158,11.751,4.679
+ C76.873,88.335,129.009,72,169.499,72c50.34,0,81.615,26.567,86.227,73.024C271.345,139.479,288.758,134,302.5,134
+ c10.265,0,22.501,4.945,22.501,28.5c0,26.976-14.824,65.562-47.938,90.953l-5.501,4.217l-4.637-5.153
+ c-6.05-6.723-13.757-10.396-24.253-11.562l-1.746-0.194c0.875,3.851,2.273,7.381,3.798,11.227
+ c1.421,3.591,2.943,7.431,4.067,11.781l0.006-0.036L259.498,278c6.913,9.213,14.501,33.549,14.501,46.5
+ c0,0.404-0.011,0.826-0.036,1.263c3.446-4.232,8.916-6.763,15.537-6.763c13.398,0,19.501,8.553,19.501,16.501
+ c0,3.262-1.63,6.604-4.312,11.722c-0.3,0.573-0.668,1.277-1.004,1.936c0.398,0.487,0.848,1.01,1.231,1.457
+ c3.22,3.751,8.084,9.422,8.084,16.884C313.001,379.377,304.8,388,293.5,388L293.5,388z M246.439,356.083
+ c-0.28,0.348-0.395,0.733-0.437,1.229C246.153,356.929,246.298,356.518,246.439,356.083L246.439,356.083z M270.056,335.941
+ c-1.21,1.355-2.773,2.583-4.78,3.574c1.535-0.104,3.14-0.207,4.789-0.296c-0.04-0.548-0.065-1.123-0.065-1.721
+ C270,336.973,270.019,336.451,270.056,335.941L270.056,335.941z M219.021,317.979c0.093,0.007,0.194,0.013,0.302,0.018
+ c0.586-0.089,1.986-0.42,2.938-0.646c0.477-0.114,0.957-0.226,1.438-0.338c-1.721,0.032-3.758,0.146-4.62,0.547
+ C219.059,317.655,219.036,317.792,219.021,317.979L219.021,317.979z M172.531,125.258c8.011,5.611,15.058,13.592,20.572,20.675
+ c2.554-14.033,4.928-23.67,8.842-29.011c-5.7,1.628-9.894,5.061-12.692,7.353c-2.444,1.999-4.553,3.726-7.753,3.726
+ c-2.045,0-3.8-0.7-6.71-1.858C174.111,125.874,173.352,125.572,172.531,125.258L172.531,125.258z"/>
+ <path fill="#1F1F1F" d="M169.5,79.5c36,0,75,15,79,69h-3c-5-28-16-40-37-40c-16,0-25,12-27,12s-12.5-6-23-6c-21,0-43,12-42,42
+ l-55,11c0-6,0-12,1-18c-7-3-19-5-25-12c-7.5-8.83-13-34-13-36c0-6,3-8,7-8c5,0,7,5,7,7c0,3-4,16-4,18
+ c0,13.355,12.737,23.069,27.8,23.069c0.728,0,1.463-0.023,2.2-0.069C79.5,93.5,134.5,79.5,169.5,79.5 M213.538,119.277
+ c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21c0,26-15,62-45,85c-9-10-20-13-29-14
+ c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1
+ c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7c2,0,18.01-9.73,21-10
+ c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8c10,0,11-19,11-20
+ c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13c-16,0-3-16-15-16
+ c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10c-4-4-10.5-5.83-15.5-8.83
+ c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19c-3.04,0.868-11.171,1.549-20.627,1.549
+ c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5
+ c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39c3.5-20.17,6.83-43.83,13-45
+ C211.555,119.349,212.566,119.277,213.538,119.277 M54.5,250.5c10.601,13.491,30.487,26.054,46.237,26.054
+ c0.594,0,1.182-0.018,1.763-0.054c0,3,0.83,8.5,0.83,10.5c0,15-15.83,15.5-24.83,15.5c-27,0-24.17-8.17-24.5-25.83
+ C53.96,274.67,54.5,256.5,54.5,250.5 M253.5,282.5c6,8,13,31,13,42c0,8-6,10-14,10c-7,0-7-9-7-13
+ C245.5,318.5,251.5,295.5,253.5,282.5 M138.5,283.5c1,1-0.59,3.01,0,19c0.17,4.5,4.83,17.17,11,22
+ c0.394,0.31,0.843,0.454,1.342,0.454c7.473,0,25.783-32.642,27.658-35.454l3,41c0,5,0,11-3,16c-4,5-22,8-31,8c-15,0-29-5-27-22
+ c-0.17-12.17,4-39,7-49H138.5 M169.5,64.5c-22.887,0-47.102,5.267-66.436,14.451c-22.318,10.602-38.762,26.385-48.174,46.081
+ c-2.892-1.323-4.917-3.379-5.317-5.69c0.286-1.215,0.786-3.146,1.146-4.539c1.934-7.468,2.781-11.077,2.781-14.302
+ c0-10.625-8.84-22-22-22c-12.953,0-22,9.458-22,23c0,5.403,4.153,19.196,4.33,19.781c3.642,12.041,7.645,20.522,12.238,25.93
+ l0.022,0.026l0.022,0.025c5.736,6.693,13.632,10.188,20.458,12.587c-0.062,2.329-0.068,4.619-0.069,6.88
+ c-3.33,2.099-6.335,4.699-8.847,7.953c-3.655,4.736-7.666,12.895-6.012,24.87c1.152,8.332,3.418,19.828,7.859,31.554V250.5
+ c0,3.184-0.17,10.403-0.307,16.204c-0.159,6.711-0.212,9.158-0.19,10.267c0.029,1.535,0.031,3.051,0.034,4.517
+ c0.015,8.896,0.031,18.094,6.835,25.802C53.794,316.263,66.235,317.5,78.5,317.5c6.544,0,14.191-0.376,21.283-3.167
+ c2.781-1.094,5.281-2.484,7.479-4.137c-1.056,8.09-1.759,15.938-1.766,21.561c-1.177,12.445,3.43,20.561,7.567,25.214
+ c7.394,8.313,18.98,12.529,34.438,12.529c5.904,0,13.821-0.954,20.661-2.489c6.875-1.543,12.2-3.518,16.228-6.052
+ c2.301,4.51,5.13,8.851,8.412,12.832C204.34,387.79,219.86,395.5,236.5,395.5c8.772,0,20.174-4.999,35.323-12.061
+ c0.02-0.009,0.04-0.019,0.06-0.028c0.447,0.926,0.981,1.858,1.621,2.783c2.932,4.245,8.782,9.306,19.996,9.306
+ c7.6,0,14.536-2.912,19.53-8.201c4.817-5.1,7.47-12.132,7.47-19.799c0-8.513-4.28-14.937-7.848-19.338
+ c2.113-4.158,3.848-8.218,3.848-12.662c0-11.927-9.274-24-27-24c-3.298,0-6.405,0.485-9.255,1.394
+ c-2.486-13.581-8.349-30.866-14.745-39.394l-9.87-13.16c-0.968-3.413-2.118-6.49-3.218-9.299c3.468,1.514,6.374,3.645,8.938,6.493
+ l9.274,10.305l11.002-8.435C316.77,232.461,332.5,191.32,332.5,162.5c0-5.601-0.454-13.9-4.378-21.287
+ c-5.04-9.488-14.14-14.713-25.622-14.713c-12.295,0-26.812,3.88-40.602,8.463c-1.801-9.966-4.853-19.031-9.12-27.063
+ c-5.635-10.608-13.4-19.48-23.079-26.371C214.048,70.389,193.232,64.5,169.5,64.5L169.5,64.5z M153.054,279.371l0.912-0.261
+ l2.951-5.902c1.771-3.542,3.868-8.042,5.472-11.744c0.449-1.035,0.853-1.989,1.216-2.875c0.6,8.093,2.501,14.303,4.513,19.443
+ l-2.098,3.147c-0.447,0.67-0.922,1.462-2.05,3.349c-4.393,7.349-7.831,12.719-10.507,16.642c-0.255-7.688,0.052-11.492,0.22-13.565
+ C153.833,285.754,154.081,282.688,153.054,279.371L153.054,279.371z"/>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M445.01,377.502H416.6c-0.828,0-1.501-0.673-1.501-1.501v-67.812
+ c0-3.775-0.607-6.899-1.808-9.283c-2.233-4.446-6.292-6.605-12.412-6.605c-7.158,0-11.952,2.849-14.657,8.708
+ c-1.406,3.146-2.121,7.051-2.121,11.583v63.41c0,0.828-0.673,1.501-1.501,1.501h-27.8c-0.828,0-1.501-0.673-1.501-1.501v-63.33
+ c0-6.069-0.609-10.49-1.816-13.142c-2.1-4.593-6.162-6.828-12.414-6.828c-7.419,0-12.225,2.26-14.695,6.912
+ c-1.373,2.681-2.073,6.848-2.073,12.368v64.02c0,0.828-0.673,1.501-1.501,1.501h-28.202c-0.828,0-1.501-0.673-1.501-1.501V269.8
+ c0-0.828,0.673-1.501,1.501-1.501h27.001c0.828,0,1.501,0.673,1.501,1.501v10.492c2.533-3.545,4.988-6.237,7.326-8.03
+ c5.624-4.353,12.977-6.562,21.853-6.562c8.402,0,15.317,1.902,20.551,5.65c0.03,0.02,0.057,0.04,0.082,0.063
+ c3.509,2.895,6.334,6.504,8.422,10.749c3.508-5.25,7.753-9.242,12.649-11.891c5.95-3.04,12.626-4.572,19.875-4.572
+ c4.873,0,9.735,0.959,14.446,2.849c4.774,1.902,9.153,5.276,13.018,10.025c3.147,3.89,5.287,8.71,6.37,14.331
+ c0.668,3.688,1.007,9.069,1.007,16.015l-0.189,67.085C446.507,376.831,445.836,377.502,445.01,377.502L445.01,377.502z"/>
+ <path fill="#1F1F1F" d="M411.86,267.2c4.7,0,9.32,0.909,13.89,2.739c4.56,1.82,8.7,5.021,12.41,9.58c3,3.711,5.02,8.271,6.06,13.67
+ c0.65,3.58,0.98,8.82,0.98,15.73L445.01,376H416.6v-67.811c0-4.039-0.66-7.359-1.97-9.959c-2.49-4.961-7.07-7.431-13.75-7.431
+ c-7.73,0-13.07,3.19-16.02,9.58c-1.51,3.38-2.26,7.45-2.26,12.21V376h-27.8v-63.33c0-6.311-0.65-10.9-1.95-13.76
+ c-2.35-5.141-6.94-7.71-13.78-7.71c-7.95,0-13.29,2.569-16.02,7.71c-1.5,2.93-2.25,7.279-2.25,13.07V376h-28.2V269.8h27v15.46
+ c3.44-5.529,6.69-9.47,9.74-11.81c5.39-4.171,12.37-6.25,20.94-6.25c8.12,0,14.68,1.79,19.68,5.37c4.02,3.32,7.08,7.58,9.15,12.779
+ c3.65-6.24,8.18-10.83,13.59-13.76C398.44,268.66,404.82,267.2,411.86,267.2 M411.86,264.2c-7.485,0-14.391,1.587-20.523,4.718
+ c-0.022,0.011-0.043,0.022-0.065,0.034c-4.465,2.418-8.405,5.893-11.758,10.363c-2.029-3.501-4.587-6.534-7.643-9.058
+ c-0.053-0.045-0.108-0.087-0.164-0.127c-5.497-3.936-12.706-5.931-21.427-5.931c-9.215,0-16.878,2.313-22.776,6.877
+ c-1.614,1.238-3.242,2.832-4.904,4.808V269.8c0-1.657-1.343-3-3-3h-27c-1.657,0-3,1.343-3,3V376c0,1.657,1.343,3,3,3h28.2
+ c1.657,0,3-1.343,3-3v-64.02c0-5.276,0.646-9.214,1.92-11.703c2.165-4.076,6.539-6.077,13.35-6.077
+ c5.682,0,9.194,1.893,11.052,5.957c0.764,1.682,1.678,5.222,1.678,12.513V376c0,1.657,1.343,3,3,3h27.8c1.657,0,3-1.343,3-3v-63.41
+ c0-4.321,0.672-8.018,1.999-10.986c2.453-5.313,6.678-7.804,13.281-7.804c5.574,0,9.091,1.835,11.069,5.776
+ c1.097,2.176,1.651,5.072,1.651,8.613V376c0,1.657,1.343,3,3,3h28.41c1.653,0,2.996-1.338,3-2.991l0.19-67.08
+ c0-7.044-0.346-12.517-1.028-16.275c-1.136-5.897-3.381-10.94-6.679-15.02c-4.031-4.955-8.615-8.479-13.631-10.48
+ C421.97,265.194,416.922,264.2,411.86,264.2L411.86,264.2z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter" filterUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="105.83" y="47.5" width="122.67" height="85.774" id="SVGID_1_">
+ <g filter="url(#Adobe_OpacityMaskFilter)">
+
+ <image overflow="visible" width="128" height="91" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAItAAADjQAABP//2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAFsAgAMBIgACEQEDEQH/
+xACNAAEAAgMBAQAAAAAAAAAAAAAABQcBBAYCAwEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQQCAwEB
+AAAAAAAAAwQBAgUGABAgERMwElAxFEAWEQABAwIEBAUEAwAAAAAAAAABABECIQMgMUESEFFhIjBx
+gTIEQJGhQlJiFBIBAAAAAAAAAAAAAAAAAAAAUP/aAAwDAQACEQMRAAAAr8GZad70qyHvKHKfdZzp
+qvewam91PYlQa1oVofICXiLCOv38ZGMj56MkITakR49hqVDclRECD6XBVlxm4AAAA8/M91ZavGlZ
+M4J+26rtU9cl0VaFjyNMWmSrGQDU4GxqyO7ia/1Dai/WCc7ist024jWHrrOR2y8fpEypljyZr7qq
+1IIAD15AAHV9PVosuF44b+gAAH//2gAIAQIAAQUA/If/2gAIAQMAAQUA/If/2gAIAQEAAQUA6Vra
+8p646zB9UdHVhRha3apiGmYcQOpbsiJmdX1z7wrjABpdIF4yWtLM1yulmFLGNdXn0m4tjHWbYXTJ
+mVsCAQ9hwI7hZBZc/XXcf/a5i0qLg6kCMkHwqpuf80n5BhVQ8oKlI5kBQRfZQ1Fkeuk42KirERHw
+sR5Dt8eMl0WH7T60rAVfiJHmm8LTRnpgQ+7JYwfrW+C1orA2wFn983LGwwC1ZpbmoBm761fqEl4H
+RzeFV3sdmAOVifPbkq2sshkzY3Jr5gVxZnJAJTKgHcn65pcxDILR6n2xUFsaYTFw+aYxjGGyg3Qd
+haxYe5qSIwNgbENjItsW9pOTMzzVmKhZYz1FlsptbbNyZBonLEtfml5a4yhJBB9bT4ru9qyLsRPI
+D5R+5R9cWzKzuEdqZfpctKRk80EI9izH9pe215t2RMxOC2iFqj3FX6s7utTju72vDuYccn/L/9oA
+CAECAgY/AEP/2gAIAQMCBj8AQ//aAAgBAQEGPwDgIxBJOQCEiNoK3Rr5hbb0DHrpi3CJjHRNcHbz
+wgDM5KN67F5SqgNoTGIR7AXRn8an9dE1y1KmoDr2S+xQFu0WOpDKNz5A3S6oR2gKXbop2pfqfxgB
+IeMD+VFg1MDSDqsQvYFSITRDcJPyUm/bP0wRuSFZVKAGnhS8l6Hjbt/ykAoUZh4ch0UbrasTxthn
+EaqI6eDukWATQkCeE2FRUIxkGILHgZaBgojojM6I/FJ7oljyHqgYyBfFIRzZXPjXpkwlIygZF8zU
+VKBJGSkDII3LWevCXmFGuilEkKV22wm+aEZyJtPXookF3GGQ6IfIt0lAu4Ww16omdwsdAm3FVUnN
+XBW4yZgpRslov7iu+bruX+acssn5ISGuAkqbYRJ2BoULYNDngt3HYOx9VGunF5FSAkEbcC4epxVw
+OMwo27p2kc1W4PumFwP5oi05KO+TROg+m//Z" transform="matrix(1 0 0 1 103 45)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_1_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M170,62c10.33,0,14-3.67,28.67-13
+ c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5c-8.5,5.68,29.5,34.67-22.67,42.26
+ c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12c-15-3.67-25.67-2.89-28.5,17
+ c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M170,62
+ c10.33,0,14-3.67,28.67-13c22.66,14.21-2.84,34.11,28.33,54c-7.33-1.65-15.33-4.33-21,1.5c-8.5,0-8.83-6.97-14.5-15.5
+ c-8.5,5.68,29.5,34.67-22.67,42.26c-28.03,4.09-8.5-17.05-36.83-34.1c0,0-2.83,0-5.67,2.84c2.84,2.84,15.17,12,15.17,12
+ c-15-3.67-25.67-2.89-28.5,17c-2.83-5.68-5.67-12.04-5.67-20.56c0-14.21,6.67-59.11,29.34-59.11C142.33,49.33,159.67,62,170,62z"
+ />
+ </g>
+</g>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M293.5,382c-9.998,0-10.315-5.942-10.546-10.279
+ c-0.217-4.07-0.465-5.721-4.453-5.721c-1.218,0-7.149,2.766-12.382,5.203C255.8,376.014,242.957,382,236.5,382
+ c-12.534,0-24.353-5.965-33.282-16.796C195.682,356.062,191,344.297,191,334.499v-21.89c-0.17-1.201-0.341-2.459-0.518-3.752
+ c-0.845-6.225-1.805-13.276-3.424-18.945c-1.138-4.55-2.757-8.294-4.324-11.914c-2.56-5.912-5.206-12.029-5.732-21.414
+ c-0.002-1.18,0.212-2.402,0.442-3.695c0.355-2.016,0.799-4.522-0.004-5.328c-2.376-2.377-5.892-4.014-9.292-5.598
+ c-1.994-0.93-4.056-1.889-5.919-3.005c-8.018-4.455-11.089-13.294-13.123-19.146c-0.37-1.066-0.69-1.987-0.997-2.755l-0.038-0.095
+ l-0.025-0.1c-0.816-3.267-2.352-5.857-5.008-9.474c-4.247,2.344-4.152,6.092-4.06,9.727c0.013,0.481,0.023,0.944,0.023,1.384
+ c0,11.657,6.152,18.462,10.225,22.965c2.191,2.423,3.775,4.175,3.775,6.034c0,3.166-8.077,19.509-8.159,19.671l-0.296,0.592
+ l-0.633,0.181c-3.363,0.961-11.819,1.606-21.042,1.606c-7.303,0-25.421-0.454-35.926-4.656
+ c-30.922-10.66-39.625-50.538-41.929-67.187c-0.814-5.892,0.305-10.864,3.325-14.776c6.96-9.015,22.775-10.902,35.482-12.418
+ c8.487-1.01,19.755-2.69,30.65-4.316c5.071-0.757,10.019-1.493,14.48-2.133c0.025-0.116,0.048-0.296,0.048-0.562
+ c0-1.51-0.598-4.632-1.125-7.385c-0.542-2.835-0.875-4.625-0.875-5.616v-6.001c0-11.356,13.95-20.5,25.5-20.5
+ c17.761,0,34.676,23.646,42.804,35.009c0.467,0.654,0.904,1.262,1.304,1.819c0.164-0.953,0.326-1.91,0.488-2.869
+ c4.085-24.071,7.006-38.771,13.125-39.933c1.174-0.168,2.268-0.248,3.317-0.248c16.308,0,21.873,18.76,25.937,32.459
+ c0.671,2.254,1.311,4.413,1.952,6.341c2.131-0.759,4.403-1.588,6.779-2.457C264.544,148.163,286.92,140,302.5,140
+ c16.501,0,16.501,16.934,16.501,22.5c0,25.503-14.097,62.045-45.589,86.19l-1.1,0.843l-0.928-1.03
+ c-6.994-7.771-16.168-12.191-28.05-13.513l-1.984-0.221l0.764-1.845c7.093-17.106,9.554-38.674,5.162-45.25
+ c-0.763-1.145-1.647-1.677-2.776-1.677c-0.789,0-1.146,0.278-1.346,0.486c-1.222,1.269-1.085,4.924-0.984,7.593
+ c0.074,1.938,0.139,3.62-0.208,4.779c-1.132,6.178-3.464,15.332-5.345,22.691c-1.271,4.979-2.585,10.13-2.617,10.963
+ c0,8.704,2.499,15.01,5.145,21.688c2.633,6.646,5.355,13.515,5.355,22.801c0,3.303-4.705,23.461-7.551,33.896l-0.417,1.529
+ l-1.504-0.501C232.255,311,227.348,311,225.499,311c-7.319,0-12.5,0.539-12.5,7.499c0,4.545,3.536,5.5,6.501,5.5
+ c0.724,0,2.461-0.41,4.142-0.808c2.474-0.585,5.031-1.19,6.857-1.19c3.014,0,7.5,1.731,7.5,6.5c0,5.946-5.555,7.321-10.456,8.535
+ c-5.938,1.47-9.543,2.707-9.543,7.465c0,5.075,2.224,5.5,4.5,5.5c0.845-0.146,5.368-2.56,8.67-4.322
+ c6.417-3.424,10.441-5.515,12.195-5.673c0.25-0.022,0.488-0.033,0.711-0.033c2.091,0,3.172,0.936,3.71,1.721
+ c1.59,2.315,0.269,5.939,0.114,6.346l-0.238,0.614l-0.61,0.241c-7.2,2.854-7.12,6.903-7.063,9.859
+ c0.006,0.263,0.011,0.511,0.011,0.746c0,4.068,2.289,6.5,4.499,6.5c8.643,0,9.501-18.314,9.501-18.5v-1.499h1.5
+ c2.734,0,5.946-0.217,9.348-0.444c3.719-0.248,7.553-0.507,11.48-0.551c0.231-1.382,0.072-2.827-0.097-4.339
+ c-0.113-1.024-0.231-2.083-0.231-3.166c0-9.228,7.274-12.5,13.502-12.5c9.963,0,13.5,5.655,13.5,10.5
+ c0,1.88-1.435,4.758-3.625,8.935c-0.976,1.864-2.313,4.413-2.376,5.091c0,1.074,1.71,3.068,3.363,4.997
+ c2.957,3.445,6.636,7.734,6.636,12.976C306.999,376.174,301.574,382,293.5,382L293.5,382z"/>
+ <g>
+ <path fill="#1F1F1F" d="M213.538,119.277c18.366,0.001,22.213,25.926,26.962,39.223c17-6,44-17,62-17c13,0,15,11,15,21
+ c0,26-15,62-45,85c-9-10-20-13-29-14c8.5-20.5,10.83-49,1-49c-6,0-3,11-4,14c-2,11-8,32-8,34c0,18,10.5,26.5,10.5,44.5
+ c0,3-4.5,22.5-7.5,33.5c-3-1-8-1-10-1c-6,0-14,0-14,9c0,6,5,7,8,7c2,0,8-2,11-2c2,0,6,1,6,5c0,10-20,4-20,16c0,6,3,7,6,7
+ c2,0,18.01-9.73,21-10c0.204-0.019,0.396-0.027,0.579-0.027c4.739,0,2.421,6.027,2.421,6.027c-8.83,3.5-8,9-8,12c0,5,3,8,6,8
+ c10,0,11-19,11-20c6,0,14-1,22-1c1-3,0-6,0-9c0-8,6-11,12-11c8,0,12,4,12,9c0,3-6,12-6,14c0,4,10,10,10,18s-5,13-12,13
+ c-16,0-3-16-15-16c-4,0-32,16-42,16c-27,0-44-28-44-46v-22c-1-7-2-16-4-23c-3-12-9.17-18.17-10-33c0-3,2-8,0-10
+ c-4-4-10.5-5.83-15.5-8.83c-9-5-11.5-16.17-13.5-21.17c-1-4-3-7-6-11c-7,3-6,9-6,13c0,18,14,25,14,29c0,2-5,13-8,19
+ c-3.04,0.868-11.171,1.549-20.627,1.549c-12.319,0-26.887-1.154-35.373-4.549c-29-10-38.26-46.189-41-66
+ C43.67,177,65.83,174.17,84,172c12.6-1.5,31.5-4.5,45.5-6.5c0,0,1,0,1-2c0-3-2-11-2-13v-6c0-10,12.5-19,24-19c20.17,0,40,33,45,39
+ c3.5-20.17,6.83-43.83,13-45C211.555,119.349,212.566,119.277,213.538,119.277 M213.538,116.277L213.538,116.277
+ c-1.121,0-2.285,0.085-3.462,0.253l-0.067,0.009l-0.067,0.013c-7.154,1.356-10.092,16.252-14.208,40.478
+ c-8.547-11.923-25.273-34.53-43.232-34.53c-6.25,0-12.861,2.322-18.139,6.37c-5.631,4.32-8.861,10.017-8.861,15.63v6
+ c0,1.128,0.326,2.887,0.902,5.898c0.415,2.168,0.916,4.785,1.058,6.364c-4.108,0.593-8.54,1.254-13.201,1.949
+ c-10.889,1.624-22.148,3.302-30.614,4.31c-12.988,1.551-29.15,3.481-36.493,12.993c-3.275,4.243-4.495,9.591-3.625,15.896
+ c1.349,9.753,4.34,24.19,10.932,37.593c7.76,15.777,18.523,26.143,31.994,30.81c10.756,4.273,29.043,4.736,36.418,4.736
+ c9.348,0,17.968-0.669,21.452-1.664l1.269-0.362l0.59-1.181c0.34-0.68,8.317-16.676,8.317-20.342c0-2.437-1.747-4.369-4.165-7.043
+ c-3.916-4.332-9.835-10.879-9.835-21.957c0-0.452-0.012-0.929-0.024-1.423c-0.087-3.454,0.041-5.904,2.188-7.644
+ c2.064,2.912,3.25,5.088,3.926,7.794l0.05,0.197l0.075,0.189c0.294,0.734,0.609,1.641,0.973,2.689
+ c1.976,5.687,5.281,15.197,13.81,19.963c1.919,1.147,4.002,2.118,6.018,3.057c3.399,1.584,6.611,3.08,8.799,5.234
+ c0.252,0.677-0.136,2.876-0.347,4.069c-0.23,1.3-0.467,2.645-0.467,3.873v0.084l0.005,0.084c0.54,9.651,3.24,15.891,5.851,21.924
+ c1.614,3.729,3.138,7.252,4.234,11.636l0.012,0.049l0.014,0.048c1.589,5.56,2.54,12.55,3.378,18.716
+ c0.172,1.267,0.34,2.497,0.507,3.673V334.5c0,10.129,4.813,22.26,12.56,31.658c9.218,11.183,21.45,17.342,34.44,17.342
+ c6.791,0,19.8-6.064,30.254-10.938c4.641-2.163,10.408-4.851,11.819-5.062c2.478,0.006,2.669,0.32,2.882,4.301
+ c0.219,4.089,0.626,11.699,12.044,11.699c8.832,0,15-6.579,15-16c0-5.797-3.88-10.319-6.997-13.953
+ c-1.082-1.262-2.686-3.131-2.97-3.964c0.292-0.864,1.411-2.999,2.171-4.449c2.362-4.507,3.796-7.404,3.796-9.634
+ c0-5.973-4.638-12-15-12c-9.112,0-15,5.495-15,14c0,1.166,0.123,2.267,0.241,3.331c0.107,0.968,0.207,1.864,0.204,2.7
+ c-3.537,0.083-7.038,0.317-10.199,0.529c-3.374,0.226-6.562,0.439-9.246,0.439h-2.961l-0.039,2.989
+ c-0.035,2.644-1.656,17.011-8,17.011c-1.21,0-3-1.589-3-5c0-0.244-0.005-0.503-0.01-0.775c-0.057-2.933-0.117-5.966,6.116-8.436
+ l1.223-0.484l0.472-1.228c0.302-0.785,1.707-4.846-0.276-7.733c-0.608-0.886-2.06-2.371-4.945-2.371
+ c-0.274,0-0.561,0.014-0.851,0.04c-1.974,0.178-5.405,1.917-12.763,5.842c-2.98,1.59-7.018,3.744-8.235,4.145
+ c-1.546-0.011-2.731-0.216-2.731-3.999c0-3.57,2.432-4.528,8.404-6.008c4.894-1.212,11.596-2.872,11.596-9.992
+ c0-5.252-4.527-8-9-8c-2.002,0-4.647,0.626-7.205,1.231c-1.293,0.307-3.246,0.769-3.795,0.769c-5,0-5-2.906-5-4
+ c0-5.094,2.882-6,11-6c1.611,0,6.513,0,9.051,0.846l3.009,1.003l0.834-3.06C240.998,301.743,246,280.698,246,277
+ c0-9.572-2.776-16.579-5.461-23.355c-2.583-6.521-5.024-12.68-5.039-21.068c0.119-1.052,1.42-6.151,2.57-10.657
+ c1.876-7.352,4.206-16.483,5.351-22.711c0.392-1.379,0.328-3.073,0.248-5.188c-0.054-1.437-0.219-5.81,0.57-6.5c0,0,0,0,0.001,0
+ c0.011,0,0.1-0.021,0.261-0.021c0.299,0,0.854,0,1.528,1.008c3.675,5.502,2.161,25.852-5.299,43.842l-1.53,3.69l3.97,0.44
+ c11.498,1.277,20.363,5.538,27.101,13.025l1.855,2.061l2.2-1.687c14.329-10.985,26.298-25.655,34.612-42.423
+ c7.457-15.037,11.562-31.003,11.562-44.958c0-5.936,0-24-18-24c-15.847,0-37.457,7.883-54.821,14.218
+ c-1.838,0.67-3.611,1.317-5.304,1.927c-0.479-1.517-0.963-3.148-1.464-4.836C236.714,135.658,230.964,116.277,213.538,116.277
+ L213.538,116.277z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_1_" filterUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="46.254" y="119.277" width="271.246" height="261.223" id="SVGID_2_">
+ <g filter="url(#Adobe_OpacityMaskFilter_1_)">
+
+ <image overflow="visible" width="278" height="268" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAARTAAAJlwAADlr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAQwBFgMBIgACEQEDEQH/
+xACaAAEAAgMBAQAAAAAAAAAAAAAABgcDBAUBAgEBAAAAAAAAAAAAAAAAAAAAABAAAgICAQMEAgEE
+AwEAAAAAAgMBBAUGACARExAwQBIxFBWAITM0IjI1FhEAAgIBAQYFAgUEAwEAAAAAAQIAESEDIDFB
+URIiEDBAYXGRE4GxMlIjocFCYuFyMwQSAQAAAAAAAAAAAAAAAAAAAID/2gAMAwEAAhEDEQAAAK/A
+AAAAPs+Hf7BCEqjprgAzdPrTsp7WtOtjVAAAAAAAAAAB7N4nbRubf16YI/J/kpblXDWJzPr52iy5
+VyeuYa5suOlRMuIAPreOekfSIUm8eOSAAAAADcuCmLhO0AD5i8qxlGb8v5pYG3jyDT3Pkprj27rF
+ed+fbpGOz0fTBk+xjjUp5RTzeHHMhjd7tEH+rK3yrNi19oqres3KQSbbHoAAB8fOUeegB4D0AADl
+dXglatIY7DidrDZ+x49AAAAAAAADz35OBwNWGl65+F3QADyGS2ryLvB3bZpi3zpAAAAeOEdfNT1j
+nbeegAADFl0yt4r1eYWzI+B3wB57iORU0qhQB92vUs4LH9+PsAAA8gU9hJW0yhvQLsycnqnoAAHD
+7cMK6y6fcLQ6mlug8Ee6FYHK1QAdLmi7OnXc/MwAAHG7OMo7Un0DJfP6Q7RcnsQlRlAB81xZFekC
+6vKFmyaju0XFqRThn3EffkAAA2LIq/aLxywKVnSYsh689Hjw5VU2PVZhBktyobWJQ89APIxKNApD
+563JAPv4AAAAAD66fKEw6tdC0c1Uelq6la+EhjwALKrWUlre4cwA+PvwraE2ZWYAAAAAAAAAAAAA
+2tUXP2YNOD0Dz34IdWc2hIAAAAAAAAAAAAABK7Rp23DaeaxtamnxiG8HZ1gAAAAAAAAAAAAADoXD
+TtwGSrrGp0+vnD6eAAAAAAAAAAAAAAA37gp63jfiMy4RCND65Bh8ABlxSYxa9p8Qq/zPgAAAAAAA
+AAAMtsVFNiya9n3GKd+5Z0iFa3Y4g++hPitpvKugZIHPa6IMAAAAAAAAAABt6gtuR0tY5IdfL9lP
+8KyYodGw4VjJxrVZoF687hSMqXky2JAAAAAAAAAAADb1BM+3WP0T+O8L5NrVADu9+B/Rv84AP//a
+AAgBAgABBQD+jL//2gAIAQMAAQUA/oy//9oACAEBAAEFAPiVqrLJ/wDzlmRtULFWfjqUxx0dWsP4
+GmB9bunmuLdGxULo1TF+QVYlfjzWBWasjSOnY+KAyZa1r49quOUoIUuONqKZGY15Tgy2EfRZ6LH7
+HqtSAREdosKhq9wxfaPi4oYO9gkCKfUhgozOHW9eZxTaL+YxXlu4JP0r+my0oaiyrw2PUFsZKMJf
+fyvp9lnE6SMcdpixHJ4N1L3MSUDfwhRNfoMYMdiwgWFX6TKT9ZT5chjl/RHpkUeVGz05rXhAjmrg
+r1maGlSXKOqIVCMPXXAVEhyFBHDSso2HHBKf14/kPaqlIWNdkpq9LlC0Nn1ybAahhLiXpD6L9CGC
+jL6xXyBVNQrJmviEJgErDqzYxKCGP5/phbJ4NG2fF4LIslWq3jlGlOKcfo6QZSqDWV1GsGQuupc+
+7my7VyKP5/ia7nlS1W0/lbSA7I02uMK1auPF6/WHgYmuPBooHgoUPIEY97v25BDPsbG6Ar+aP5Kn
+VK0/A68sARj0qGFhHO0fE2HPDjk4fdP2rFWwL1dMz2jb7sAj7T9tVUJ2scoQT8U57DvbJkaxkuxr
+b5ZW6bTIWrcL3kZzVGwFygX2R7JFAx+2n7RMFHsvL6q3V4kxX+TV/wDW6c9eFKcnZmzb5hH+G/h3
+Qyv7Ow5T9NC9rvxcwWVG2n2ck3xo2Sz5r6Bk360uRrdFhsKXt+W/t6JOVt1e3DEexP43k5/X5peR
+IeJODX7Gw2IXXut81rEpl1/CK+lf1mYiNgyoIVkbhW7PrpeQ/wCCjgw65/G61SOvzC3Jq3cNdFye
+ufxuVvx15mZnV0fa3jfrCfXKZAK6tkzJWndGDvTUuYe6L0+xnqUWK+TqFUtxMxOs7DAcpZNTwgoK
+Ok/+u9sKB5iMkunOJ2ZBRWySXRBhMXb60hs+fI5mZKeiJmJ1PN9xruFodblwwNswXkgwJZCZAWN2
+W1UnC7SmzCXC4Ogv7jvNeSV6Aw1ljdmtVSr7OJqzWzkcMYbD6qVtlR+vZ8HLS4Gj15pYSrOisbfo
+h7a7NXtm+r07VT8tdgStnqDmBEzMz7FDIOpMwm1LZFXLJbAvWfIKJ6CKBjYsgIJuPl9j0X/k1WYi
+v05WvDUbFTmtd94DMCp7BdrTU3SR5X3RBcHca3A22sUM22uPH7fXkc7nf2o9YntOn24NET3joaP2
+XulKIH4cEQ8kiLr06/421WQxXRP43Bcfr/LxtqatvA3IfX6J/G4tiK/zNLvSxET3j1YX1Dd7UyPz
+NKsyLUF9let90LTtVry2/mas2V36B/ZH44++hPGZ6vHMrnFmvIv89v5mDKRyOJnvXyVr9dGc2S06
+zN+5PJt2S5M95+Zhf/Qw/wDr7Aozq21GqzztPzsL/wChh/8AXekXBmdarNJmDrom3WSIlEQXRXrs
+sMRq7DC7r7a8EMjPxMPPa/hSia/M/fVWXkdg8putub1alUFxV8cEKzyFrXckZs/ErM8VjWrcMRP4
+302Qri1MZMUCGGiIl2meCppTFC4XNIxtha+31XueQ8ITMzPxdPyv9kMhi8/hAyCo0ZgtXra6q86f
+gZ+eYOn+zYx+upIVYGsPEVVIg47ju+Naz4+NulTs4DMLeoSEx8YcuVxJO2IJd/mp0pCKrVLW7K11
+cDYKpGl4OHMUQerP4/8AUs/GwuZOgzD59TwVYWyD+shs2GVchWBhTatlVQLm1Aobuw3LMjcsizVs
+wTq9myBK2wgkfj0sjZpljdwiIXtaTG9sKCG3nQmX5Cw7kzM+uCysVodsQeLLZGbjPkj5OF5OqO/e
+fJ29f//aAAgBAgIGPwAZf//aAAgBAwIGPwAZf//aAAgBAQEGPwD0nQg+TOoE/SfyLjn6gJpi2MB1
+Lo8BMpmE6dgzp1Vxz2RqMMtmCxG7Y2mR232+mCLvJoRXZbY5JMGJulERqUG4zAE6d/TxVeZAiY4C
+VCCI2qq5XPptMGKa4bFGN23cY1/GT9PDSX3uL8eL43iPp/tONikUsfYQUnSDzgLk+4EtgT8w0kLL
+ZUbx5mmTzqL8bJBjdt3G0mBr/EwGr6azF+PFh7QtVB5SgseQgpOkHnAdW2+YOwfSDtEws3SiIxrh
+PsVjrqvL02G8MIhPLaKkRm017t4qM/8A9Gn0d2PwgXxIPGXqIGo2IKQCvaDtEwNpviIP9v7HawhP
+4GDp0mz7QD7dA8Z3YHsJ3kmKzr1UQRed0CDgNumFy1WvOb4iHh1f2Ph06SljAdSwOQnepPzAPtjH
+tB2D6T9In6RP0iYWYHn4PkN8T7vD7n/EXSXjvikrBgTA9Kz3u4T7epaEnAPGBhtEx88DOrjdw3zE
+FDh6Yyv9h+c03XeGES+W0TPtA7znwKnjRi/HlWTQnT1C5Yz5TGBOJMT/ALD84nwNps1iO92AaHgh
+ug2Ivx5TMDVCfcZv4i27kIpu7HlN8Qi7CzTUbywiXy2SxjaaNlsDxRx/iQYmeA8kxxw8Bosf0moD
+5LZ4TUe7tjU0l5G4vxsWY3dVCNqE2t9uwumxyuICPJ1K5HwVrpWwYueHkvngZZ3mfcO4YEAHLYOa
+jaKHHE7K5pWOfmLnh5LCrsR9MigSSssbxF0tRqYc4O4Swb2jKB3nPgOrHvAvWPrBTCXcOYdLSbuM
+JJsnedmxvG6Lps3cuDAQfIKmNqIveMgwo4phvEDIaYbiIBqEso4iKOsXygZTsmM37Tf08epGKnmI
+q6p6l5wHq4RtPSa2MLubY7ztrqIaF9wijqgIPkNfKHp35vxGppMVYHhxiF95A2nxwMZDvUkbBCsQ
+DwlnJ8kOhPTxWBWajxBg7hMGYOxZMbPCPqHiceK/I/OIByG02OELcH/Pz+pCVPMTJ6hANQlT7yi4
++s/9B9Zhx9Zlx9YQNQfWFNNrvYsbxEzeBAdkiM4GVN+kwSPiZJPzt/ZY7jj4gO059j6xNQbrAMXO
+8bTj2PrUBOaowHYJhQcTXrTp8AfzinYOeECXus+tq8Govx4dzCYYRgrR3969bp1F+Ize0fT0WpVN
+EzOs07tQmWfW6cX4jheU1EcUwY/1Phu9dpxfiFWhcoLhpRCMQgbtkJpizxMtruFlvHAwqcEb/S6Z
+i/HgzMaqEaORz4TuOOW11EWbgxwjYj9O6/S6b8iImeHgQDQJAP18KQXL1Me0oTEpUJJ9pjRY/hOr
+WQoSTgz4EZQe44Es7z6ZdNjlcGAiMpF3MsxS90wtVPtJgnwyLAxASggtRKQVCJ91QT0G69OuoD23
+3Re67EsZE3RqHCAkdpsX4DUcUWNwXMsJ0dYuWpuNYuxCyilY59OFY/x3v5Re4G5YMIuHnvBEvUPU
+BwMAsCoQrWeQhCsUX+sGqNVuoG95iFzmsw54Rq3+oB02PT+2BdRuk+8/WPrCeoQ/byfaV1dI9pZy
+fEIxqp+rhKBtR6rsv8Lndde97WN8zde97H//2Q==" transform="matrix(1 0 0 1 43 116)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_2_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#CEBC01" d="M240.5,158.5c-5-14-9-42-30-39c-6.17,1.17-9.5,24.83-13,45
+ c-5-6-24.83-39-45-39c-11.5,0-24,9-24,19v6c0,2,2,10,2,13c0,2-1,2-1,2c-14,2-32.9,5-45.5,6.5c-18.17,2.17-40.33,5-37.5,25.5
+ c2.74,19.811,12,56,41,66c15,6,49,5,56,3c3-6,8-17,8-19c0-4-14-11-14-29c0-4-1-10,6-13c3,4,5,7,6,11c2,5,4.5,16.17,13.5,21.17
+ c5,3,11.5,4.83,15.5,8.83c2,2,0,7,0,10c0.83,14.83,7,21,10,33c2,7,3,16,4,23v22c0,18,17,46,44,46c10,0,38-16,42-16
+ c12,0-1,16,15,16c7,0,12-5,12-13s-10-14-10-18c0-2,6-11,6-14c0-5-4-9-12-9c-6,0-12,3-12,11c0,3,1,6,0,9c-8,0-16,1-22,1
+ c0,1-1,20-11,20c-3,0-6-3-6-8c0-3-0.83-8.5,8-12c0,0,2.5-6.5-3-6c-2.99,0.27-19,10-21,10c-3,0-6-1-6-7c0-12,20-6,20-16
+ c0-4-4-5-6-5c-3,0-9,2-11,2c-3,0-8-1-8-7c0-9,8-9,14-9c2,0,7,0,10,1c3-11,7.5-30.5,7.5-33.5c0-18-10.5-26.5-10.5-44.5
+ c0-2,6-23,8-34c1-3-2-14,4-14c9.83,0,7.5,28.5-1,49c9,1,20,4,29,14c30-23,45-59,45-85c0-10-2-21-15-21
+ C284.5,141.5,257.5,152.5,240.5,158.5z"/>
+ </g>
+</g>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M193.772,206.837c-5.358,0-10.236-2.729-13.736-7.683l-0.198-0.28
+ l-0.093-0.33c-8.547-30.246-25.982-48.151-39.992-62.539c-2.949-3.03-5.736-5.89-8.24-8.667l-0.94-1.043l0.662-1.238
+ c3.588-6.719,10.431-10.272,19.783-10.272c5.169,0,10.029,1.066,13.196,1.96c2.665,0.75,5.5,1.129,8.429,1.129
+ c0.004,0,0.006,0,0.01,0c7.256,0,14.981-2.283,22.334-6.601c2.978-1.746,6.236-2.632,9.686-2.632
+ c6.564,0,11.543,3.219,11.753,3.357l1.181,0.775l-0.336,1.373c-4.887,19.923-7.7,46.495-8.604,81.235l-0.006,0.27l-0.078,0.255
+ C206.643,202.342,200.553,206.835,193.772,206.837L193.772,206.837z"/>
+ <path fill="#917013" d="M204.676,110.643c6.042,0,10.654,3.027,10.654,3.027c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-1.729,5.729-7.115,9.506-12.899,9.506c-4.249,0-8.713-2.037-12.101-6.836c-10.51-37.2-34.41-56.19-48.67-72
+ c3.897-7.297,11.292-9.214,18.019-9.214c5.322,0,10.226,1.199,12.651,1.884c2.928,0.824,5.941,1.206,8.975,1.206
+ c8.011,0,16.174-2.662,23.355-6.876C198.988,111.248,201.975,110.643,204.676,110.643 M204.677,106.643L204.677,106.643
+ c-3.812,0-7.412,0.979-10.701,2.907c-7.053,4.139-14.428,6.327-21.332,6.327c-2.745,0-5.4-0.355-7.892-1.057
+ c-3.285-0.927-8.337-2.033-13.734-2.033c-10.138,0-17.589,3.917-21.547,11.33l-1.323,2.478l1.881,2.086
+ c2.528,2.803,5.326,5.676,8.289,8.718c13.853,14.225,31.094,31.929,39.502,61.69l0.187,0.659l0.396,0.561
+ c3.883,5.5,9.342,8.528,15.369,8.528c7.655,0,14.534-5.078,16.729-12.35l0.155-0.515l0.014-0.537
+ c0.889-34.117,3.764-61.306,8.546-80.812l0.673-2.746l-2.363-1.551C217.296,110.176,211.832,106.643,204.677,106.643
+ L204.677,106.643z"/>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#FF6600" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_2_" filterUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="133" y="110.643" width="82.33" height="94.193" id="SVGID_3_">
+ <g filter="url(#Adobe_OpacityMaskFilter_2_)">
+
+ <image overflow="visible" width="87" height="99" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAIPAAADBQAAA/v/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAGMAVwMBIgACEQEDEQH/
+xACPAAEAAgMBAQAAAAAAAAAAAAAABgcCAwUBBAEBAAAAAAAAAAAAAAAAAAAAABAAAQQBAwMDBQEA
+AAAAAAAAAwECBAYFABAgETESUCETMDIjMxQ0EQACAQEGAwgDAQAAAAAAAAABAgARECAhMUEDcRIi
+MFFhgZGhMkJigrITEgEAAAAAAAAAAAAAAAAAAABQ/9oADAMBAAIRAxEAAACv2ySEXWJ8xBEowI1n
+MZGQLbaXOKmfaNVkVRIS3Ped0jW2jDL0OH24uVm+YYgk1lUhMSzffm+kA8hE2rwggAGeAsia0lbB
+2HnphWlk1YRcAACawr7i7tnJ6xpqi1anI+AAACxJvS0zJXU0ihhpAAAA2BjiAH//2gAIAQIAAQUA
+9K//2gAIAQMAAQUA9K//2gAIAQEAAQUA5iCUzolalGSTWXiaSK8ZwAed+Oq7TIyoBVkmkjVCUuQj
+kpkpVh0j3gVUAdCxYRtzEQYxS3IuZxUhgj4MgSNY1nirGLpY4l1/MLSDY3exERkd5PLJ6r+efGLi
+8kOSPlbDeEfz/JtWs+QBMdPZIHwXtdJHhH3RVatWsDmrEktOPd/23cifFwCV4SVTOIcY3o9uxPZl
+4d15YbIOhSsJkGyA7SF6CuhXKflTcu7QSIQepX6bj/q5YeUsWbhJaGBqYvQFtIjpnJFVFqOU8gjM
+x7clIY0Nkej5/PEZR0EsWzj+PKWZijlSHSDfQH2J32//2gAIAQICBj8AK//aAAgBAwIGPwAr/9oA
+CAEBAQY/AL/LtqWPhAz1A7hKioMXZObMFHmaQInmYC45ie+U5B6Q8q0PhDysaT5H0gO6C3GDoA8p
+QARjTSbQ0G4n9CAPqc4tKQUExE+M+MwFrcINyuH+qmvAixdrdbDQwY1rffgZz/lze9bRs7rYaEwY
+1umPwNwMpoRkYuzut1CAg3DGBOeF1dxDRlNYqserIiBhraZT8heU16GIBi41qLWgXQm+Nl26lwgY
+WNF4m+jaMaGLjpY0C61JvgjMZRAxxgNYwrpCR49gAT0EwdfvCA2cbcbXLsfv+s+37W//2Q==" transform="matrix(1 0 0 1 131 108)">
+ </image>
+ </g>
+ </mask>
+ <g opacity="0.6" mask="url(#SVGID_3_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#7F3E03" d="M215.33,113.67c-4.33,17.66-7.66,43.26-8.66,81.66
+ c-3,9.939-17,14-25,2.67c-10.51-37.2-34.41-56.19-48.67-72c6.98-13.07,25.18-8.88,30.67-7.33c10.66,3,22.43,0.14,32.33-5.67
+ C205.67,107.33,215.33,113.67,215.33,113.67z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M210.936,113.796
+ c-11.983,64.227-22.738,60.791-73.726,11.721c0.148-11.045,22.734-5.193,27.431-4c9.14,2.331,19.844,0.864,27.954-4.462
+ C202.85,110.315,210.936,113.796,210.936,113.796z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M168.67,263.33c-3.67-2-6.67-3.33-9-6.33
+ c-5,9-11.17,30.5-11.17,41.5c0,3,1,10,2,15C177.67,289,168.67,263.33,168.67,263.33z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFCC" d="M281.5,290.5c-7.17-37.17-37.17-42.83-37.17-42.83
+ c3,10,6.34,19.33,9.17,27.83C261,282,273.5,289.5,281.5,290.5z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M166.77,188.01c5.25,0.61,8.37,11.49,9.67,19.44c1.33,8.17,1.33,16.76-4.05,17.47
+ c-8.06,1.08-11.67-21.93-11.67-21.93C158.28,187.29,166.77,188.01,166.77,188.01z"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M229.86,192.56c0.99,10.209-3.431,23.959-6.57,24.39
+ c-6.29,0.85-7.51-9.05-7.72-10.7c-0.41-3.3-3.061-24.76,7.939-26.25C228.33,182,229.45,189.26,229.86,192.56z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M216.51,195.85c0.93-8.26,11.79-5.08,11.79,2.86
+ c0,7.95-2.1,14.261-4.34,16.21C217.75,220.32,215.58,204.12,216.51,195.85z"/>
+<path opacity="0.1" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M163.09,206.33c-1.19-8.13,9.59-8.43,11.57-0.891
+ c1.97,7.551,1.6,14.181,0.02,16.721C170.3,229.18,164.28,214.45,163.09,206.33z"/>
+<rect x="701" y="306" fill-rule="evenodd" clip-rule="evenodd" fill="#FBE500" stroke="#1F1F1F" stroke-width="20" stroke-linecap="round" stroke-linejoin="round" width="14" height="34"/>
+<circle fill-rule="evenodd" clip-rule="evenodd" fill="#FFFF33" cx="182.5" cy="139.5" r="11.5"/>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_3_" filterUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="116.477" y="113.17" width="34.353" height="30.33" id="SVGID_4_">
+ <g filter="url(#Adobe_OpacityMaskFilter_3_)">
+
+ <image overflow="visible" width="39" height="35" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGnAAAB+QAAAmr/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIACMAJwMBIgACEQEDEQH/
+xAB9AAEAAgMBAAAAAAAAAAAAAAAABgcBBAUDAQEAAAAAAAAAAAAAAAAAAAAAEAACAwEAAwEBAAAA
+AAAAAAADBAECBQYQMBEAExEBAAIBAwMDBQAAAAAAAAAAAQACETFBAxBxEiGBkcEiMhMEEgEAAAAA
+AAAAAAAAAAAAAAAw/9oADAMBAAIRAxEAAACAdvxtYgHEurklMuyNm1aPm5YOlHo4aqPjzBnAAf/a
+AAgBAgABBQD0/wD/2gAIAQMAAQUA9P8A/9oACAEBAAEFAIibTncyy3BOKvFH8NxOfk/edThlzMzx
+CDIRzGvlhIJ7PgO1yJKUZSJW4f2kwMYdRql91Nu6h8rrhQMnYLRXY67+1bHJY/ifP//aAAgBAgIG
+PwAf/9oACAEDAgY/AB//2gAIAQEBBj8AAMroQtfIOxM1yMVq2qb7zG8GxkrKvjtMeJLPiaTg4g+3
+l5aVx3sER1zK4elhdp/JjSvPxq9rkOWm2pAvfCajPzPmWpwvks/eubli3uevU+vX/9k=" transform="matrix(1 0 0 1 114 111)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_4_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M149.33,127.79c0,14.21-17,14.21-17,14.21
+ c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12C139,114.67,149.33,119.26,149.33,127.79z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M149.33,127.79
+ c0,14.21-17,14.21-17,14.21c-14.16,0-14.9-11.46-14.16-14.21c2.16-8.12,3.83-13.12,15.16-13.12
+ C139,114.67,149.33,119.26,149.33,127.79z"/>
+ </g>
+</g>
+<g>
+ <g>
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_4_" filterUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="204.631" y="103.813" width="29.007" height="25.239" id="SVGID_5_">
+ <g filter="url(#Adobe_OpacityMaskFilter_4_)">
+
+ <image overflow="visible" width="34" height="31" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAGWAAAB3QAAAkb/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIAB8AIgMBIgACEQEDEQH/
+xAB4AAADAQEAAAAAAAAAAAAAAAAABQcGAwEBAAAAAAAAAAAAAAAAAAAAABAAAgIDAQEAAAAAAAAA
+AAAAAgMEBQABBiASEQACAQMDAwUAAAAAAAAAAAABAgAREgMQITFRsQRBcdEiYhIBAAAAAAAAAAAA
+AAAAAAAAIP/aAAwDAQACEQMRAAAAwTkqRLU1vnZkQBrUoy5KrPV6Y5gH/9oACAECAAEFAPX/2gAI
+AQMAAQUA9f/aAAgBAQABBQBSjccbl5Tgk8tMSLksSecugGya+CnSpUBJr6ysBesoJuosystUkmVa
+IBfU2i2awfr6iTrxYSLC/MH7cR5//9oACAECAgY/AF//2gAIAQMCBj8AX//aAAgBAQEGPwAJjFWM
+DEkE9BLlNfcQpkFrDQ3DgiA0h2EbIg+y76C40Dd4tWHENGEZFNSdhoLa3elOYBi8fK46hGPYSj+P
+mQdTjf4hOe6/9Cmn/9k=" transform="matrix(1 0 0 1 202 101)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_5_)">
+ <path fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" d="M230.33,111.33c3,4.84,4.68,17.12-15.33,16.17
+ c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ <path fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" d="M230.33,111.33
+ c3,4.84,4.68,17.12-15.33,16.17c-7-0.33-11.35-13.81-7.33-17.83C211.67,103,226,104.33,230.33,111.33z"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M116,85c4-22.67,16.33-29.33,23.67-27.67
+ c7.33,1.67,20,11,30,11c12.33,0,16.66-3,23.66-8.66c7-5.67,10.31,2.33,10,12.33C203,83,207,91.67,204,92s-10.67-18-19-11
+ c-5.33,10.67-2,25.67-12.33,27c-6.7,0.86-21.67-3.67-35-19c-3.07-3.52-12-6-15,1c-3.33,7.75-3.34,4.67-5,8
+ C116.61,100.11,114.86,91.45,116,85z"/>
+<g>
+ <g>
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#23A9FF" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#000000" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+ <defs>
+ <filter id="Adobe_OpacityMaskFilter_5_" filterUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55">
+
+ <feColorMatrix type="matrix" values="-1 0 0 0 1 0 -1 0 0 1 0 0 -1 0 1 0 0 0 1 0" color-interpolation-filters="sRGB" result="source"/>
+ </filter>
+ </defs>
+ <mask maskUnits="userSpaceOnUse" x="141.5" y="1.5" width="55" height="55" id="SVGID_6_">
+ <g filter="url(#Adobe_OpacityMaskFilter_5_)">
+
+ <image overflow="visible" width="60" height="60" xlink:href="data:image/jpeg;base64,/9j/4AAQSkZJRgABAgEASABIAAD/7AARRHVja3kAAQAEAAAAHgAA/+4AIUFkb2JlAGTAAAAAAQMA
+EAMCAwYAAAHLAAACZwAAAyD/2wCEABALCwsMCxAMDBAXDw0PFxsUEBAUGx8XFxcXFx8eFxoaGhoX
+Hh4jJSclIx4vLzMzLy9AQEBAQEBAQEBAQEBAQEABEQ8PERMRFRISFRQRFBEUGhQWFhQaJhoaHBoa
+JjAjHh4eHiMwKy4nJycuKzU1MDA1NUBAP0BAQEBAQEBAQEBAQP/CABEIADwAPAMBIgACEQEDEQH/
+xACFAAACAwEBAQAAAAAAAAAAAAAABwIFBgQBAwEBAAAAAAAAAAAAAAAAAAAAABAAAQQBBAMBAAAA
+AAAAAAAAAgEDBAYFABARFCBAExIRAAEDAgQFBAMAAAAAAAAAAAEAEQJBEiAhMQMQUXGRImGhwWKx
+MhMSAQAAAAAAAAAAAAAAAAAAAED/2gAMAwEAAhEDEQAAAF/6bAorJk9gpKZ5Z8UxYV5aNtbNU+no
+BGQYVdN9TFy2Ua0TUEZB4cpQqvS5cO7hBi3ag+w0chmYEogf/9oACAECAAEFAPQ//9oACAEDAAEF
+APQ//9oACAEBAAEFANIiksKvzpWhpcpUkVGY0MmFIilsiKS1qtfXUPFMMAjDSaciMuJmq4xIby+M
+PHyNV+F2p2KhgwxuYoQ3HFibPC80sUWUwnDXhZwRY34XuVGQLUyI4jjPha5YhH/afaFJKLIrmbbf
+ZAxNNps1thu15rsObY3KyIDmKuDJiNnjKMq2RwHM2w5GnDNw9055HucH9uN//9oACAECAgY/AAf/
+2gAIAQMCBj8AB//aAAgBAQEGPwBAAOToEDbbE909x7ImJJPqFbvQI9acQAHJ0Cjvb0Xkc86IC0L9
+QmMQpeALoxY2HQ8uEXDxj+VFhTAQaqcgMxmFbXRlJ+YUemGfRW/f5RiTmSCokcsMw9Cr6XXe7qG9
+Ghz6KHlqE8S/EknNS2ISd9enEGBeD5hASmx5FPeESJjujDYLvWiM5l5HU4PHWjI2/wBGrqvO5vs/
+zg//2Q==" transform="matrix(1 0 0 1 139 -1)">
+ </image>
+ </g>
+ </mask>
+ <g mask="url(#SVGID_6_)">
+ <circle fill-rule="evenodd" clip-rule="evenodd" fill="#043C96" cx="169" cy="29" r="26"/>
+ <circle fill="none" stroke="#043C96" stroke-width="3" stroke-linecap="round" stroke-linejoin="round" cx="169" cy="29" r="26"/>
+ </g>
+</g>
+<path opacity="0.25" fill-rule="evenodd" clip-rule="evenodd" fill="#FFFFFF" d="M149,22.33c13.33-26.66,39.67-9,40.67,3.34
+ C190.67,38,141.58,37.17,149,22.33z"/>
+<rect x="337.5" y="105.5" fill-rule="evenodd" clip-rule="evenodd" fill="none" width="764" height="167"/>
+<text transform="matrix(1 0 0 1 337.5 191.7793)" fill="#1F1F1F" font-family="'Helvetica-Bold'" font-size="120" letter-spacing="-6">Powered by</text>
+</svg>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon.ico
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon.ico b/community/mahout-mr/mr/src/images/logos/favicon.ico
new file mode 100644
index 0000000..4f5878d
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon.ico differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon128.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon128.png b/community/mahout-mr/mr/src/images/logos/favicon128.png
new file mode 100644
index 0000000..a477d15
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon128.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon16.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon16.png b/community/mahout-mr/mr/src/images/logos/favicon16.png
new file mode 100644
index 0000000..595b237
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon16.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon32.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon32.png b/community/mahout-mr/mr/src/images/logos/favicon32.png
new file mode 100644
index 0000000..39668fd
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon32.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/favicon64.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/favicon64.png b/community/mahout-mr/mr/src/images/logos/favicon64.png
new file mode 100644
index 0000000..5032b12
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/favicon64.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png
new file mode 100644
index 0000000..9868200
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png
new file mode 100644
index 0000000..4ef5bdd
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-200.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png
new file mode 100644
index 0000000..2fbd589
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-300.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png
new file mode 100644
index 0000000..d9ac832
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-400.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png
new file mode 100644
index 0000000..8f8af00
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-100.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png
new file mode 100644
index 0000000..9814d31
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-poweredby-55.png differ

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png b/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png
new file mode 100644
index 0000000..583436b
Binary files /dev/null and b/community/mahout-mr/mr/src/images/logos/mahout-logo-transparent-400.png differ
r***@apache.org
2018-06-28 14:55:07 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/integration/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/integration/pom.xml b/community/mahout-mr/integration/pom.xml
index cb0c19a..8dbe599 100644
--- a/community/mahout-mr/integration/pom.xml
+++ b/community/mahout-mr/integration/pom.xml
@@ -25,7 +25,7 @@
<groupId>org.apache.mahout</groupId>
<artifactId>mahout</artifactId>
<version>0.13.1-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
+ <relativePath>../mr/pom.xml</relativePath>
</parent>

<artifactId>mahout-integration</artifactId>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/README.txt b/community/mahout-mr/mr-examples/bin/README.txt
new file mode 100644
index 0000000..7ad3a38
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/README.txt
@@ -0,0 +1,13 @@
+This directory contains helpful shell scripts for working with some of Mahout's examples.
+
+To set a non-default temporary work directory: `export MAHOUT_WORK_DIR=/path/in/hdfs/to/temp/dir`
+ Note that this requires the same path to be writable both on the local file system and on HDFS.
+
+Here's a description of what each does:
+
+classify-20newsgroups.sh -- Run SGD and Bayes classifiers over the classic 20 News Groups. Downloads the data set automatically.
+cluster-reuters.sh -- Cluster the Reuters data set using a variety of algorithms. Downloads the data set automatically.
+cluster-syntheticcontrol.sh -- Cluster the Synthetic Control data set. Downloads the data set automatically.
+factorize-movielens-1m.sh -- Run the Alternating Least Squares Recommender on the Grouplens data set (size 1M).
+factorize-netflix.sh -- (Deprecated due to lack of availability of the data set) Run the ALS Recommender on the Netflix data set.
+spark-document-classifier.mscala -- A mahout-shell script which trains and tests a Naive Bayes model on the Wikipedia XML dump and defines simple methods to classify new text.

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
new file mode 100755
index 0000000..f47d5c5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/classify-20newsgroups.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the 20newsgroups dataset, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-20newsgroups.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]}"
+ echo "4. ${algorithm[3]}"
+ echo "5. ${algorithm[4]}"
+ echo "6. ${algorithm[5]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+# Spark specific check and work
+if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+ if [ "$MASTER" == "" ] ; then
+ echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
+ exit 1
+ fi
+ if [ "$MAHOUT_LOCAL" != "" ] ; then
+ echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
+ exit 1
+ fi
+fi
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/20news-bayesinput ]; then
+ if [ ! -e ${WORK_DIR}/20news-bydate ]; then
+ if [ ! -f ${WORK_DIR}/20news-bydate.tar.gz ]; then
+ echo "Downloading 20news-bydate"
+ curl http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz -o ${WORK_DIR}/20news-bydate.tar.gz
+ fi
+ mkdir -p ${WORK_DIR}/20news-bydate
+ echo "Extracting..."
+ cd ${WORK_DIR}/20news-bydate && tar xzf ../20news-bydate.tar.gz && cd .. && cd ..
+ fi
+ fi
+fi
+#echo $START_PATH
+cd $START_PATH
+cd ../..
+
+set -e
+
+if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [ "x$alg" == "xcnaivebayes-Spark" ] ); then
+ c=""
+
+ if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xnaivebayes-Spark" ]; then
+ c=" -c"
+ fi
+
+ set -x
+ echo "Preparing 20newsgroups data"
+ rm -rf ${WORK_DIR}/20news-all
+ mkdir ${WORK_DIR}/20news-all
+ cp -R ${WORK_DIR}/20news-bydate/*/* ${WORK_DIR}/20news-all
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying 20newsgroups data to HDFS"
+ set +e
+ $DFSRM ${WORK_DIR}/20news-all
+ $DFS -mkdir -p ${WORK_DIR}
+ $DFS -mkdir ${WORK_DIR}/20news-all
+ set -e
+ if [ $HVERSION -eq "1" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 1 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
+ elif [ $HVERSION -eq "2" ] ; then
+ echo "Copying 20newsgroups data to Hadoop 2 HDFS"
+ $DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
+ fi
+ fi
+
+ echo "Creating sequence files from 20newsgroups data"
+ ./bin/mahout seqdirectory \
+ -i ${WORK_DIR}/20news-all \
+ -o ${WORK_DIR}/20news-seq -ow
+
+ echo "Converting sequence files to vectors"
+ ./bin/mahout seq2sparse \
+ -i ${WORK_DIR}/20news-seq \
+ -o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ ./bin/mahout split \
+ -i ${WORK_DIR}/20news-vectors/tfidf-vectors \
+ --trainingOutput ${WORK_DIR}/20news-train-vectors \
+ --testOutput ${WORK_DIR}/20news-test-vectors \
+ --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
+
+ if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then
+
+ echo "Training Naive Bayes model"
+ ./bin/mahout trainnb \
+ -i ${WORK_DIR}/20news-train-vectors \
+ -o ${WORK_DIR}/model \
+ -li ${WORK_DIR}/labelindex \
+ -ow $c
+
+ echo "Self testing on training set"
+
+ ./bin/mahout testnb \
+ -i ${WORK_DIR}/20news-train-vectors\
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow -o ${WORK_DIR}/20news-testing $c
+
+ echo "Testing on holdout set"
+
+ ./bin/mahout testnb \
+ -i ${WORK_DIR}/20news-test-vectors\
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow -o ${WORK_DIR}/20news-testing $c
+
+ elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
+
+ echo "Training Naive Bayes model"
+ ./bin/mahout spark-trainnb \
+ -i ${WORK_DIR}/20news-train-vectors \
+ -o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
+
+ echo "Self testing on training set"
+ ./bin/mahout spark-testnb \
+ -i ${WORK_DIR}/20news-train-vectors\
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+ echo "Testing on holdout set"
+ ./bin/mahout spark-testnb \
+ -i ${WORK_DIR}/20news-test-vectors\
+ -m ${WORK_DIR}/spark-model $c -ma $MASTER
+
+ fi
+elif [ "x$alg" == "xsgd" ]; then
+ if [ ! -e "/tmp/news-group.model" ]; then
+ echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
+ ./bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups ${WORK_DIR}/20news-bydate/20news-bydate-train/
+ fi
+ echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
+ ./bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input ${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf $WORK_DIR
+ rm -rf /tmp/news-group.model
+ $DFSRM $WORK_DIR
+fi
+# Remove the work directory
+#

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
new file mode 100755
index 0000000..41dc0c9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/classify-wikipedia.sh
@@ -0,0 +1,196 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-wikipedia.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs Bayes and CBayes classifiers over the latest wikipedia dump."
+ exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+ echo "Please set MAHOUT_HOME."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-wiki
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+algorithm=( CBayes BinaryCBayes clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]} (may require increased heap space on yarn)"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/wikixml ]; then
+ mkdir -p ${WORK_DIR}/wikixml
+ fi
+ if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2 ]; then
+ echo "Downloading wikipedia XML dump"
+ ########################################################
+ # Datasets: uncomment and run "clean" to change dataset
+ ########################################################
+ ########## partial small 42.5M zipped
+ # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000030302.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########## partial larger 256M zipped
+ curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p2336425p3046511.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ######### full wikipedia dump: 10G zipped
+ # curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########################################################
+ fi
+ if [ ! -e ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml ]; then
+ echo "Extracting..."
+
+ cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+ fi
+
+echo $START_PATH
+
+set -e
+
+if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
+
+ set -x
+ echo "Preparing wikipedia data"
+ rm -rf ${WORK_DIR}/wiki
+ mkdir ${WORK_DIR}/wiki
+
+ if [ "x$alg" == "xCBayes" ] ; then
+ # use a list of 10 countries as categories
+ cp $MAHOUT_HOME/examples/bin/resources/country10.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ # use United States and United Kingdom as categories
+ cp $MAHOUT_HOME/examples/bin/resources/country2.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying wikipedia data to HDFS"
+ set +e
+ $DFSRM ${WORK_DIR}/wikixml
+ $DFS -mkdir -p ${WORK_DIR}
+ set -e
+ $DFS -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+ fi
+
+ echo "Creating sequence files from wikiXML"
+ $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
+ -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
+ -o ${WORK_DIR}/wikipediainput
+
+ # if using the 10 class problem use bigrams
+ if [ "x$alg" == "xCBayes" ] ; then
+ echo "Converting sequence files to vectors using bigrams"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm -nv \
+ -ow -ng 2
+ fi
+
+ # if using the 2 class problem try different options
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm \
+ -nv \
+ -ow \
+ -ng 1 \
+ -x 30
+ fi
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
+ --trainingOutput ${WORK_DIR}/training \
+ --testOutput ${WORK_DIR}/testing \
+ -rp 20 \
+ -ow \
+ -seq \
+ -xm sequential
+
+ echo "Training Naive Bayes model"
+ $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
+ -o ${WORK_DIR}/model \
+ -li ${WORK_DIR}/labelindex \
+ -ow \
+ -c
+
+ echo "Self testing on training set"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c
+
+ echo "Testing on holdout set: Bayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -seq
+
+ echo "Testing on holdout set: CBayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model -l \
+ ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c \
+ -seq
+fi
+
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf $WORK_DIR
+ $DFSRM $WORK_DIR
+fi
+# Remove the work directory

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/cluster-reuters.sh b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
new file mode 100755
index 0000000..49f6c94
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/cluster-reuters.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Reuters dataset and prepares it for clustering
+#
+# To run: change into the mahout directory and type:
+# examples/bin/cluster-reuters.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script clusters the Reuters data set using a variety of algorithms. The data set is downloaded automatically."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+MAHOUT="../../bin/mahout"
+
+if [ ! -e $MAHOUT ]; then
+ echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
+ exit 1
+fi
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+algorithm=( kmeans fuzzykmeans lda streamingkmeans clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding clustering algorithm"
+ echo "1. ${algorithm[0]} clustering (runs from this example script in cluster mode only)"
+ echo "2. ${algorithm[1]} clustering (may require increased heap space on yarn)"
+ echo "3. ${algorithm[2]} clustering"
+ echo "4. ${algorithm[3]} clustering"
+ echo "5. ${algorithm[4]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+if [ "x$clustertype" == "xclean" ]; then
+ rm -rf $WORK_DIR
+ $DFSRM $WORK_DIR
+ exit 1
+else
+ $DFS -mkdir -p $WORK_DIR
+ mkdir -p $WORK_DIR
+ echo "Creating work directory at ${WORK_DIR}"
+fi
+if [ ! -e ${WORK_DIR}/reuters-out-seqdir ]; then
+ if [ ! -e ${WORK_DIR}/reuters-out ]; then
+ if [ ! -e ${WORK_DIR}/reuters-sgm ]; then
+ if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+ if [ -n "$2" ]; then
+ echo "Copying Reuters from local download"
+ cp $2 ${WORK_DIR}/reuters21578.tar.gz
+ else
+ echo "Downloading Reuters-21578"
+ curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz -o ${WORK_DIR}/reuters21578.tar.gz
+ fi
+ fi
+ #make sure it was actually downloaded
+ if [ ! -f ${WORK_DIR}/reuters21578.tar.gz ]; then
+ echo "Failed to download reuters"
+ exit 1
+ fi
+ mkdir -p ${WORK_DIR}/reuters-sgm
+ echo "Extracting..."
+ tar xzf ${WORK_DIR}/reuters21578.tar.gz -C ${WORK_DIR}/reuters-sgm
+ fi
+ echo "Extracting Reuters"
+ $MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-out
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying Reuters data to Hadoop"
+ set +e
+ $DFSRM ${WORK_DIR}/reuters-sgm
+ $DFSRM ${WORK_DIR}/reuters-out
+ $DFS -mkdir -p ${WORK_DIR}/
+ $DFS -mkdir ${WORK_DIR}/reuters-sgm
+ $DFS -mkdir ${WORK_DIR}/reuters-out
+ $DFS -put ${WORK_DIR}/reuters-sgm ${WORK_DIR}/reuters-sgm
+ $DFS -put ${WORK_DIR}/reuters-out ${WORK_DIR}/reuters-out
+ set -e
+ fi
+ fi
+ echo "Converting to Sequence Files from Directory"
+ $MAHOUT seqdirectory -i ${WORK_DIR}/reuters-out -o ${WORK_DIR}/reuters-out-seqdir -c UTF-8 -chunk 64 -xm sequential
+fi
+
+if [ "x$clustertype" == "xkmeans" ]; then
+ $MAHOUT seq2sparse \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
+ && \
+ $MAHOUT kmeans \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+ -c ${WORK_DIR}/reuters-kmeans-clusters \
+ -o ${WORK_DIR}/reuters-kmeans \
+ -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+ -x 10 -k 20 -ow --clustering \
+ && \
+ $MAHOUT clusterdump \
+ -i `$DFS -ls -d ${WORK_DIR}/reuters-kmeans/clusters-*-final | awk '{print $8}'` \
+ -o ${WORK_DIR}/reuters-kmeans/clusterdump \
+ -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
+ -dt sequencefile -b 100 -n 20 --evaluate -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure -sp 0 \
+ --pointsDir ${WORK_DIR}/reuters-kmeans/clusteredPoints \
+ && \
+ cat ${WORK_DIR}/reuters-kmeans/clusterdump
+elif [ "x$clustertype" == "xfuzzykmeans" ]; then
+ $MAHOUT seq2sparse \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
+ && \
+ $MAHOUT fkmeans \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
+ -c ${WORK_DIR}/reuters-fkmeans-clusters \
+ -o ${WORK_DIR}/reuters-fkmeans \
+ -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
+ -x 10 -k 20 -ow -m 1.1 \
+ && \
+ $MAHOUT clusterdump \
+ -i ${WORK_DIR}/reuters-fkmeans/clusters-*-final \
+ -o ${WORK_DIR}/reuters-fkmeans/clusterdump \
+ -d ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/dictionary.file-0 \
+ -dt sequencefile -b 100 -n 20 -sp 0 \
+ && \
+ cat ${WORK_DIR}/reuters-fkmeans/clusterdump
+elif [ "x$clustertype" == "xlda" ]; then
+ $MAHOUT seq2sparse \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
+ && \
+ $MAHOUT rowid \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
+ -o ${WORK_DIR}/reuters-out-matrix \
+ && \
+ rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
+ && \
+ $MAHOUT cvb \
+ -i ${WORK_DIR}/reuters-out-matrix/matrix \
+ -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
+ -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+ -dt ${WORK_DIR}/reuters-lda-topics \
+ -mt ${WORK_DIR}/reuters-lda-model \
+ && \
+ $MAHOUT vectordump \
+ -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+ -o ${WORK_DIR}/reuters-lda/vectordump \
+ -vs 10 -p true \
+ -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+ -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+ && \
+ cat ${WORK_DIR}/reuters-lda/vectordump
+elif [ "x$clustertype" == "xstreamingkmeans" ]; then
+ $MAHOUT seq2sparse \
+ -i ${WORK_DIR}/reuters-out-seqdir/ \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
+ && \
+ rm -rf ${WORK_DIR}/reuters-streamingkmeans \
+ && \
+ $MAHOUT streamingkmeans \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
+ --tempDir ${WORK_DIR}/tmp \
+ -o ${WORK_DIR}/reuters-streamingkmeans \
+ -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
+ -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
+ -k 10 -km 100 -ow \
+ && \
+ $MAHOUT qualcluster \
+ -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/part-r-00000 \
+ -c ${WORK_DIR}/reuters-streamingkmeans/part-r-00000 \
+ -o ${WORK_DIR}/reuters-cluster-distance.csv \
+ && \
+ cat ${WORK_DIR}/reuters-cluster-distance.csv
+fi

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
new file mode 100755
index 0000000..796da33
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/cluster-syntheticcontrol.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads the Synthetic control dataset and prepares it for clustering
+#
+# To run: change into the mahout directory and type:
+# examples/bin/cluster-syntheticcontrol.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script clusters the Synthetic Control data set. The data set is downloaded automatically."
+ exit
+fi
+
+algorithm=( kmeans fuzzykmeans )
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding clustering algorithm"
+ echo "1. ${algorithm[0]} clustering"
+ echo "2. ${algorithm[1]} clustering"
+ read -p "Enter your choice : " choice
+fi
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]} Clustering"
+clustertype=${algorithm[$choice-1]}
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+ if [ -n "$2" ]; then
+ cp $2 ${WORK_DIR}/.
+ else
+ echo "Downloading Synthetic control data"
+ curl http://archive.ics.uci.edu/ml/databases/synthetic_control/synthetic_control.data -o ${WORK_DIR}/synthetic_control.data
+ fi
+fi
+if [ ! -f ${WORK_DIR}/synthetic_control.data ]; then
+ echo "Couldn't download synthetic control"
+ exit 1
+fi
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ]; then
+ echo "Checking the health of DFS..."
+ $DFS -ls /
+ if [ $? -eq 0 ];then
+ echo "DFS is healthy... "
+ echo "Uploading Synthetic control data to HDFS"
+ $DFSRM ${WORK_DIR}/testdata
+ $DFS -mkdir -p ${WORK_DIR}/testdata
+ $DFS -put ${WORK_DIR}/synthetic_control.data ${WORK_DIR}/testdata
+ echo "Successfully Uploaded Synthetic control data to HDFS "
+
+ options="--input ${WORK_DIR}/testdata --output ${WORK_DIR}/output --maxIter 10 --convergenceDelta 0.5"
+
+ if [ "${clustertype}" == "kmeans" ]; then
+ options="${options} --numClusters 6"
+ # t1 & t2 not used if --numClusters specified, but parser requires input
+ options="${options} --t1 1 --t2 2"
+ ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+ else
+ options="${options} --m 2.0f --t1 80 --t2 55"
+ ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job ${options}
+ fi
+ else
+ echo " HADOOP is not running. Please make sure your hadoop is running. "
+ fi
+elif [ "$MAHOUT_LOCAL" != "" ]; then
+ echo "running MAHOUT_LOCAL"
+ cp ${WORK_DIR}/synthetic_control.data testdata
+ ../../bin/mahout.bu org.apache.mahout.clustering.syntheticcontrol."${clustertype}".Job
+ rm testdata
+else
+ echo " HADOOP_HOME variable is not set. Please set this environment variable and rerun the script"
+fi
+# Remove the work directory
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
new file mode 100755
index 0000000..29730e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/factorize-movielens-1M.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# Before using this script, you have to download and extract the Movielens 1M dataset
+# from http://www.grouplens.org/node/73
+#
+# To run: change into the mahout directory and type:
+# export MAHOUT_LOCAL=true
+# Then:
+# examples/bin/factorize-movielens-1M.sh /path/to/ratings.dat
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the Alternating Least Squares Recommender on the Grouplens data set (size 1M)."
+ echo "Syntax: $0 /path/to/ratings.dat\n"
+ exit
+fi
+
+if [ $# -ne 1 ]
+then
+ echo -e "\nYou have to download the Movielens 1M dataset from http://www.grouplens.org/node/73 before"
+ echo -e "you can run this example. After that extract it and supply the path to the ratings.dat file.\n"
+ echo -e "Syntax: $0 /path/to/ratings.dat\n"
+ exit -1
+fi
+
+export MAHOUT_LOCAL=true
+MAHOUT="$MAHOUT_HOME/bin/mahout"
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+echo "creating work directory at ${WORK_DIR}"
+mkdir -p ${WORK_DIR}/movielens
+
+echo "Converting ratings..."
+cat $1 |sed -e s/::/,/g| cut -d, -f1,2,3 > ${WORK_DIR}/movielens/ratings.csv
+
+# create a 90% percent training set and a 10% probe set
+$MAHOUT splitDataset --input ${WORK_DIR}/movielens/ratings.csv --output ${WORK_DIR}/dataset \
+ --trainingPercentage 0.9 --probePercentage 0.1 --tempDir ${WORK_DIR}/dataset/tmp
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/dataset/trainingSet/ --output ${WORK_DIR}/als/out \
+ --tempDir ${WORK_DIR}/als/tmp --numFeatures 20 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 2
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/dataset/probeSet/ --output ${WORK_DIR}/als/rmse/ \
+ --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+# compute recommendations
+$MAHOUT recommendfactorized --input ${WORK_DIR}/als/out/userRatings/ --output ${WORK_DIR}/recommendations/ \
+ --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ \
+ --numRecommendations 6 --maxRating 5 --numThreads 2
+
+# print the error
+echo -e "\nRMSE is:\n"
+cat ${WORK_DIR}/als/rmse/rmse.txt
+echo -e "\n"
+
+echo -e "\nSample recommendations:\n"
+shuf ${WORK_DIR}/recommendations/part-m-00000 |head
+echo -e "\n\n"
+
+echo "removing work directory"
+rm -rf ${WORK_DIR}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/factorize-netflix.sh b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
new file mode 100755
index 0000000..26faf66
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/factorize-netflix.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Instructions:
+#
+# You can only use this script in conjunction with the Netflix dataset. Unpack the Netflix dataset and provide the
+# following:
+#
+# 1) the path to the folder 'training_set' that contains all the movie rating files
+# 2) the path to the file 'qualifying.txt' that contains the user,item pairs to predict
+# 3) the path to the file 'judging.txt' that contains the ratings of user,item pairs to predict for
+#
+# To run:
+# ./factorize-netflix.sh /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt
+
+echo "Note this script has been deprecated due to the lack of access to the Netflix data set."
+exit 1
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the ALS Recommender on the Netflix data set."
+ echo "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+ exit
+fi
+
+if [ $# -ne 3 ]
+then
+ echo -e "Syntax: $0 /path/to/training_set/ /path/to/qualifying.txt /path/to/judging.txt\n"
+ exit -1
+fi
+
+MAHOUT="../../bin/mahout"
+
+if [[ -z "$MAHOUT_WORK_DIR" ]]; then
+ WORK_DIR=/tmp/mahout-work-${USER}
+else
+ WORK_DIR=$MAHOUT_WORK_DIR
+fi
+
+START_PATH=`pwd`
+
+# Set commands for dfs
+source ${START_PATH}/set-dfs-commands.sh
+
+echo "Preparing data..."
+$MAHOUT org.apache.mahout.cf.taste.hadoop.example.als.netflix.NetflixDatasetConverter $1 $2 $3 ${WORK_DIR}
+
+# run distributed ALS-WR to factorize the rating matrix defined by the training set
+$MAHOUT parallelALS --input ${WORK_DIR}/trainingSet/ratings.tsv --output ${WORK_DIR}/als/out \
+ --tempDir ${WORK_DIR}/als/tmp --numFeatures 25 --numIterations 10 --lambda 0.065 --numThreadsPerSolver 4
+
+# compute predictions against the probe set, measure the error
+$MAHOUT evaluateFactorization --input ${WORK_DIR}/probeSet/ratings.tsv --output ${WORK_DIR}/als/rmse/ \
+ --userFeatures ${WORK_DIR}/als/out/U/ --itemFeatures ${WORK_DIR}/als/out/M/ --tempDir ${WORK_DIR}/als/tmp
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+
+ # print the error, should be around 0.923
+ echo -e "\nRMSE is:\n"
+ $DFS -tail ${WORK_DIR}/als/rmse/rmse.txt
+ echo -e "\n"
+ echo "removing work directory"
+ set +e
+ $DFSRM ${WORK_DIR}
+
+else
+
+ # print the error, should be around 0.923
+ echo -e "\nRMSE is:\n"
+ cat ${WORK_DIR}/als/rmse/rmse.txt
+ echo -e "\n"
+ echo "removing work directory"
+ rm -rf ${WORK_DIR}
+
+fi
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/get-all-examples.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/get-all-examples.sh b/community/mahout-mr/mr-examples/bin/get-all-examples.sh
new file mode 100755
index 0000000..4128e47
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/get-all-examples.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clones Mahout example code from remote repositories with their own
+# build process. Follow the README for each example for instructions.
+#
+# Usage: change into the mahout directory and type:
+# examples/bin/get-all-examples.sh
+
+# Solr-recommender
+echo " Solr-recommender example: "
+echo " 1) imports text 'log files' of some delimited form for user preferences"
+echo " 2) creates the correct Mahout files and stores dictionaries to translate external Id to and from Mahout Ids"
+echo " 3) it implements a prototype two-action 'cross-recommender', which takes two actions made by the same user and creates recommendations"
+echo " 4) it creates output for user->preference history CSV and an item->similar items 'similarity' matrix for use in a Solr-recommender."
+echo " To use Solr you would index the similarity matrix CSV, and use user preference history from the history CSV as a query, the result"
+echo " from Solr will be an ordered list of recommendations returning the same item Ids as were input."
+echo " For further description see the README.md here https://github.com/pferrel/solr-recommender"
+echo " To build run 'cd solr-recommender; mvn install'"
+echo " To process the example after building make sure MAHOUT_LOCAL IS SET and hadoop is in local mode then "
+echo " run 'cd scripts; ./solr-recommender-example'"
+git clone https://github.com/pferrel/solr-recommender

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/lda.algorithm
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/lda.algorithm b/community/mahout-mr/mr-examples/bin/lda.algorithm
new file mode 100644
index 0000000..fb84ea0
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/lda.algorithm
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+merge.policy=org.apache.lucene.index.LogDocMergePolicy
+merge.factor=mrg:10:20
+max.buffered=buf:100:1000
+compound=true
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.term.vector=true
+doc.tokenized=true
+log.step=600
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
+doc.maker.forever=false
+query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=false
+# --------- alg
+{ "BuildReuters"
+ CreateIndex
+ { "AddDocs" AddDoc > : *
+# Optimize
+ CloseIndex
+}
+
r***@apache.org
2018-06-28 14:55:06 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/bank-full.csv b/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
new file mode 100644
index 0000000..d7a2ede
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/bank-full.csv
@@ -0,0 +1,45212 @@
+"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
+58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
+44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
+33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
+35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
+28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
+58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
+45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
+57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
+54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
+58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
+36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
+58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
+44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
+32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
+24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
+38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
+40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
+45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
+46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
+41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
+46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
+42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
+60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
+57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
+39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
+27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
+59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
+29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
+56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
+57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
+43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
+29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
+31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
+55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
+55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
+32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
+55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
+28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
+53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
+34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
+57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
+59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
+56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
+43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
+26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
+39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
+48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
+39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
+52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
+54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
+54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
+50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
+44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
+35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
+60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
+48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
+51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
+31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
+35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
+36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
+40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
+54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
+51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
+50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
+61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
+48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
+35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
+39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
+42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
+59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
+40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
+47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
+53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
+46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
+40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
+28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
+35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
+43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
+33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
+53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
+57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
+49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
+43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
+42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
+22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
+40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
+51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
+34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
+50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
+59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
+39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
+42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
+40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
+56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
+37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
+39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
+38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
+54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
+58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
+47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
+50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
+40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
+56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
+24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
+42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
+51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
+57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
+36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
+43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
+56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
+54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
+37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
+33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
+46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
+51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
+40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
+48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
+32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
+55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
+40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
+47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
+30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
+58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
+45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
+42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
+53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
+51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
+41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
+43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
+53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
+45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
+44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
+55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
+46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
+59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
+44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
+33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
+46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
+48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
+59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
+43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
+38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
+23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
+25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
+48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
+33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
+49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
+40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
+58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
+32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
+56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
+58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
+60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
+37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
+27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
+32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
+42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
+29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
+58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
+46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
+34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
+49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
+32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
+43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
+54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
+38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
+58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
+49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
+24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
+51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
+50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
+32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
+40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
+40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
+33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
+39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
+36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
+57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
+36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
+44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
+39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
+40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
+50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
+54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
+59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
+39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
+50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
+37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
+46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
+32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
+37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
+48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
+26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
+58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
+34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
+55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
+41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
+50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
+49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
+44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
+34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
+46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
+29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
+53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
+43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
+38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
+48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
+42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
+45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
+34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
+50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
+34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
+40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
+36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
+56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
+30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
+36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
+39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
+52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
+35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
+59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
+46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
+38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
+41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
+52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
+53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
+53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
+56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
+37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
+55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
+37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
+30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
+36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
+38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
+41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
+48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
+55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
+28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
+30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
+48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
+31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
+37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
+49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
+43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
+32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
+48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
+55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
+53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
+47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
+40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
+48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
+33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
+40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
+59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
+30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
+31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
+35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
+34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
+32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
+56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
+32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
+42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
+33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
+52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
+52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
+52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
+30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
+44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
+55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
+43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
+38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
+31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
+41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
+41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
+38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
+28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
+41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
+38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
+32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
+45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
+33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
+34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
+41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
+41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
+35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
+60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
+47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
+49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
+29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
+31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
+38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
+31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
+41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
+40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
+29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
+49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
+54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
+49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
+39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
+36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
+44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
+40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
+30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
+57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
+24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
+46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
+33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
+43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
+43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
+44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
+35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
+47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
+33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
+31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
+36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
+56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
+40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
+41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
+53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
+44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
+38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
+54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
+52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
+28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
+60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
+42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
+44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
+47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
+34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
+35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
+35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
+40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
+55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
+35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
+53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
+41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
+57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
+45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
+56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
+31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
+30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
+30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
+54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
+36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
+55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
+38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
+48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
+55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
+59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
+37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
+33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
+30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
+42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
+51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
+44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
+46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
+29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
+50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
+42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
+39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
+55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
+46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
+42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
+45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
+43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
+42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
+30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
+51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
+38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
+57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
+42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
+32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
+46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
+29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
+59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
+32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
+44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
+40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
+46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
+43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
+39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
+30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
+56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
+29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
+36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
+37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
+35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
+47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
+31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
+56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
+36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
+45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
+47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
+57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
+31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
+37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
+30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
+58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
+33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
+34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
+36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
+35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
+55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
+34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
+40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
+42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
+42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
+35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
+44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
+30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
+45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
+49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
+31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
+31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
+39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
+37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
+33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
+37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
+42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
+56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
+37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
+36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
+29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
+38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
+43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
+47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
+47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
+38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
+40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
+33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
+53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
+37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
+40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
+37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
+35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
+49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
+26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
+38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
+52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
+33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
+31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
+32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
+35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
+55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
+32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
+37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
+45;"blue-collar";"married";"secondary";

<TRUNCATED>
r***@apache.org
2018-06-28 14:55:08 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data b/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
deleted file mode 100644
index 8885375..0000000
--- a/community/mahout-mr/examples/src/test/resources/wdbc/wdbc.data
+++ /dev/null
@@ -1,569 +0,0 @@
-842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
-842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902
-84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
-84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
-84358402,M,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575,0.1374,0.205,0.4,0.1625,0.2364,0.07678
-843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
-844359,M,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
-84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
-844981,M,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
-84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075
-845636,M,16.02,23.24,102.7,797.8,0.08206,0.06669,0.03299,0.03323,0.1528,0.05697,0.3795,1.187,2.466,40.51,0.004029,0.009269,0.01101,0.007591,0.0146,0.003042,19.19,33.88,123.8,1150,0.1181,0.1551,0.1459,0.09975,0.2948,0.08452
-84610002,M,15.78,17.89,103.6,781,0.0971,0.1292,0.09954,0.06606,0.1842,0.06082,0.5058,0.9849,3.564,54.16,0.005771,0.04061,0.02791,0.01282,0.02008,0.004144,20.42,27.28,136.5,1299,0.1396,0.5609,0.3965,0.181,0.3792,0.1048
-846226,M,19.17,24.8,132.4,1123,0.0974,0.2458,0.2065,0.1118,0.2397,0.078,0.9555,3.568,11.07,116.2,0.003139,0.08297,0.0889,0.0409,0.04484,0.01284,20.96,29.94,151.7,1332,0.1037,0.3903,0.3639,0.1767,0.3176,0.1023
-846381,M,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,0.4033,1.078,2.903,36.58,0.009769,0.03126,0.05051,0.01992,0.02981,0.003002,16.84,27.66,112,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287
-84667401,M,13.73,22.61,93.6,578.3,0.1131,0.2293,0.2128,0.08025,0.2069,0.07682,0.2121,1.169,2.061,19.21,0.006429,0.05936,0.05501,0.01628,0.01961,0.008093,15.03,32.01,108.8,697.7,0.1651,0.7725,0.6943,0.2208,0.3596,0.1431
-84799002,M,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,0.2303,0.07077,0.37,1.033,2.879,32.55,0.005607,0.0424,0.04741,0.0109,0.01857,0.005466,17.46,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341
-848406,M,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,0.05922,0.4727,1.24,3.195,45.4,0.005718,0.01162,0.01998,0.01109,0.0141,0.002085,19.07,30.88,123.4,1138,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216
-84862001,M,16.13,20.68,108.1,798.8,0.117,0.2022,0.1722,0.1028,0.2164,0.07356,0.5692,1.073,3.854,54.18,0.007026,0.02501,0.03188,0.01297,0.01689,0.004142,20.96,31.48,136.8,1315,0.1789,0.4233,0.4784,0.2073,0.3706,0.1142
-849014,M,19.81,22.15,130,1260,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,0.7582,1.017,5.865,112.4,0.006494,0.01893,0.03391,0.01521,0.01356,0.001997,27.32,30.88,186.8,2398,0.1512,0.315,0.5372,0.2388,0.2768,0.07615
-8510426,B,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259
-8510653,B,13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
-8510824,B,9.504,12.44,60.34,273.9,0.1024,0.06492,0.02956,0.02076,0.1815,0.06905,0.2773,0.9768,1.909,15.7,0.009606,0.01432,0.01985,0.01421,0.02027,0.002968,10.23,15.66,65.13,314.9,0.1324,0.1148,0.08867,0.06227,0.245,0.07773
-8511133,M,15.34,14.26,102.5,704.4,0.1073,0.2135,0.2077,0.09756,0.2521,0.07032,0.4388,0.7096,3.384,44.91,0.006789,0.05328,0.06446,0.02252,0.03672,0.004394,18.07,19.08,125.1,980.9,0.139,0.5954,0.6305,0.2393,0.4667,0.09946
-851509,M,21.16,23.04,137.2,1404,0.09428,0.1022,0.1097,0.08632,0.1769,0.05278,0.6917,1.127,4.303,93.99,0.004728,0.01259,0.01715,0.01038,0.01083,0.001987,29.17,35.59,188,2615,0.1401,0.26,0.3155,0.2009,0.2822,0.07526
-852552,M,16.65,21.38,110,904.6,0.1121,0.1457,0.1525,0.0917,0.1995,0.0633,0.8068,0.9017,5.455,102.6,0.006048,0.01882,0.02741,0.0113,0.01468,0.002801,26.46,31.56,177,2215,0.1805,0.3578,0.4695,0.2095,0.3613,0.09564
-852631,M,17.14,16.4,116,912.7,0.1186,0.2276,0.2229,0.1401,0.304,0.07413,1.046,0.976,7.276,111.4,0.008029,0.03799,0.03732,0.02397,0.02308,0.007444,22.25,21.4,152.4,1461,0.1545,0.3949,0.3853,0.255,0.4066,0.1059
-852763,M,14.58,21.53,97.41,644.8,0.1054,0.1868,0.1425,0.08783,0.2252,0.06924,0.2545,0.9832,2.11,21.05,0.004452,0.03055,0.02681,0.01352,0.01454,0.003711,17.62,33.21,122.4,896.9,0.1525,0.6643,0.5539,0.2701,0.4264,0.1275
-852781,M,18.61,20.25,122.1,1094,0.0944,0.1066,0.149,0.07731,0.1697,0.05699,0.8529,1.849,5.632,93.54,0.01075,0.02722,0.05081,0.01911,0.02293,0.004217,21.31,27.26,139.9,1403,0.1338,0.2117,0.3446,0.149,0.2341,0.07421
-852973,M,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,0.439,1.012,3.498,43.5,0.005233,0.03057,0.03576,0.01083,0.01768,0.002967,20.27,36.71,149.3,1269,0.1641,0.611,0.6335,0.2024,0.4027,0.09876
-853201,M,17.57,15.05,115,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,0.6003,0.8225,4.655,61.1,0.005627,0.03033,0.03407,0.01354,0.01925,0.003742,20.01,19.52,134.9,1227,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919
-853401,M,18.63,25.11,124.8,1088,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,0.8307,1.466,5.574,105,0.006248,0.03374,0.05196,0.01158,0.02007,0.00456,23.15,34.01,160.5,1670,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
-853612,M,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,0.4825,1.03,3.475,41,0.005551,0.03414,0.04205,0.01044,0.02273,0.005667,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402
-85382601,M,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,0.6009,1.398,3.999,67.78,0.008268,0.03082,0.05042,0.01112,0.02102,0.003854,20.88,32.09,136.1,1344,0.1634,0.3559,0.5588,0.1847,0.353,0.08482
-854002,M,19.27,26.47,127.9,1162,0.09401,0.1719,0.1657,0.07593,0.1853,0.06261,0.5558,0.6062,3.528,68.17,0.005015,0.03318,0.03497,0.009643,0.01543,0.003896,24.15,30.9,161.4,1813,0.1509,0.659,0.6091,0.1785,0.3672,0.1123
-854039,M,16.13,17.88,107,807.2,0.104,0.1559,0.1354,0.07752,0.1998,0.06515,0.334,0.6857,2.183,35.03,0.004185,0.02868,0.02664,0.009067,0.01703,0.003817,20.21,27.26,132.7,1261,0.1446,0.5804,0.5274,0.1864,0.427,0.1233
-854253,M,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,0.1896,0.05656,0.4615,0.9197,3.008,45.19,0.005776,0.02499,0.03695,0.01195,0.02789,0.002665,20.01,29.02,133.5,1229,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633
-854268,M,14.25,21.72,93.63,633,0.09823,0.1098,0.1319,0.05598,0.1885,0.06125,0.286,1.019,2.657,24.91,0.005878,0.02995,0.04815,0.01161,0.02028,0.004022,15.89,30.36,116.2,799.6,0.1446,0.4238,0.5186,0.1447,0.3591,0.1014
-854941,B,13.03,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.02923,0.1467,0.05863,0.1839,2.342,1.17,14.16,0.004352,0.004899,0.01343,0.01164,0.02671,0.001777,13.3,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
-855133,M,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,1.214,2.188,8.077,106,0.006883,0.01094,0.01818,0.01917,0.007882,0.001754,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504
-855138,M,13.48,20.82,88.4,559.2,0.1016,0.1255,0.1063,0.05439,0.172,0.06419,0.213,0.5914,1.545,18.52,0.005367,0.02239,0.03049,0.01262,0.01377,0.003187,15.53,26.02,107.3,740.4,0.161,0.4225,0.503,0.2258,0.2807,0.1071
-855167,M,13.44,21.58,86.18,563,0.08162,0.06031,0.0311,0.02031,0.1784,0.05587,0.2385,0.8265,1.572,20.53,0.00328,0.01102,0.0139,0.006881,0.0138,0.001286,15.93,30.25,102.5,787.9,0.1094,0.2043,0.2085,0.1112,0.2994,0.07146
-855563,M,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,0.2366,1.428,1.822,16.97,0.008064,0.01764,0.02595,0.01037,0.01357,0.00304,12.84,35.34,87.22,514,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606
-855625,M,19.07,24.81,128.3,1104,0.09081,0.219,0.2107,0.09961,0.231,0.06343,0.9811,1.666,8.83,104.9,0.006548,0.1006,0.09723,0.02638,0.05333,0.007646,24.09,33.17,177.4,1651,0.1247,0.7444,0.7242,0.2493,0.467,0.1038
-856106,M,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,0.06782,0.3704,0.8249,2.427,31.33,0.005072,0.02147,0.02185,0.00956,0.01719,0.003317,17.38,28,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027
-85638502,M,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,0.1938,0.6123,1.334,14.49,0.00335,0.01384,0.01452,0.006853,0.01113,0.00172,16.23,29.89,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618
-857010,M,18.65,17.6,123.7,1076,0.1099,0.1686,0.1974,0.1009,0.1907,0.06049,0.6289,0.6633,4.293,71.56,0.006294,0.03994,0.05554,0.01695,0.02428,0.003535,22.82,21.32,150.6,1567,0.1679,0.509,0.7345,0.2378,0.3799,0.09185
-85713702,B,8.196,16.84,51.71,201.9,0.086,0.05943,0.01588,0.005917,0.1769,0.06503,0.1563,0.9567,1.094,8.205,0.008968,0.01646,0.01588,0.005917,0.02574,0.002582,8.964,21.96,57.26,242.2,0.1297,0.1357,0.0688,0.02564,0.3105,0.07409
-85715,M,13.17,18.66,85.98,534.6,0.1158,0.1231,0.1226,0.0734,0.2128,0.06777,0.2871,0.8937,1.897,24.25,0.006532,0.02336,0.02905,0.01215,0.01743,0.003643,15.67,27.95,102.8,759.4,0.1786,0.4166,0.5006,0.2088,0.39,0.1179
-857155,B,12.05,14.63,78.04,449.3,0.1031,0.09092,0.06592,0.02749,0.1675,0.06043,0.2636,0.7294,1.848,19.87,0.005488,0.01427,0.02322,0.00566,0.01428,0.002422,13.76,20.7,89.88,582.6,0.1494,0.2156,0.305,0.06548,0.2747,0.08301
-857156,B,13.49,22.3,86.91,561,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,0.2338,1.353,1.735,20.2,0.004455,0.01382,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917
-857343,B,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,0.05888,0.4062,1.21,2.635,28.47,0.005857,0.009758,0.01168,0.007445,0.02406,0.001769,12.98,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563
-857373,B,13.64,16.34,87.21,571.8,0.07685,0.06059,0.01857,0.01723,0.1353,0.05953,0.1872,0.9234,1.449,14.55,0.004477,0.01177,0.01079,0.007956,0.01325,0.002551,14.67,23.19,96.08,656.7,0.1089,0.1582,0.105,0.08586,0.2346,0.08025
-857374,B,11.94,18.24,75.71,437.6,0.08261,0.04751,0.01972,0.01349,0.1868,0.0611,0.2273,0.6329,1.52,17.47,0.00721,0.00838,0.01311,0.008,0.01996,0.002635,13.1,21.33,83.67,527.2,0.1144,0.08906,0.09203,0.06296,0.2785,0.07408
-857392,M,18.22,18.7,120.3,1033,0.1148,0.1485,0.1772,0.106,0.2092,0.0631,0.8337,1.593,4.877,98.81,0.003899,0.02961,0.02817,0.009222,0.02674,0.005126,20.6,24.13,135.1,1321,0.128,0.2297,0.2623,0.1325,0.3021,0.07987
-857438,M,15.1,22.02,97.26,712.8,0.09056,0.07081,0.05253,0.03334,0.1616,0.05684,0.3105,0.8339,2.097,29.91,0.004675,0.0103,0.01603,0.009222,0.01095,0.001629,18.1,31.69,117.7,1030,0.1389,0.2057,0.2712,0.153,0.2675,0.07873
-85759902,B,11.52,18.75,73.34,409,0.09524,0.05473,0.03036,0.02278,0.192,0.05907,0.3249,0.9591,2.183,23.47,0.008328,0.008722,0.01349,0.00867,0.03218,0.002386,12.84,22.47,81.81,506.2,0.1249,0.0872,0.09076,0.06316,0.3306,0.07036
-857637,M,19.21,18.57,125.5,1152,0.1053,0.1267,0.1323,0.08994,0.1917,0.05961,0.7275,1.193,4.837,102.5,0.006458,0.02306,0.02945,0.01538,0.01852,0.002608,26.14,28.14,170.1,2145,0.1624,0.3511,0.3879,0.2091,0.3537,0.08294
-857793,M,14.71,21.59,95.55,656.9,0.1137,0.1365,0.1293,0.08123,0.2027,0.06758,0.4226,1.15,2.735,40.09,0.003659,0.02855,0.02572,0.01272,0.01817,0.004108,17.87,30.7,115.7,985.5,0.1368,0.429,0.3587,0.1834,0.3698,0.1094
-857810,B,13.05,19.31,82.61,527.2,0.0806,0.03789,0.000692,0.004167,0.1819,0.05501,0.404,1.214,2.595,32.96,0.007491,0.008593,0.000692,0.004167,0.0219,0.00299,14.23,22.25,90.24,624.1,0.1021,0.06191,0.001845,0.01111,0.2439,0.06289
-858477,B,8.618,11.79,54.34,224.5,0.09752,0.05272,0.02061,0.007799,0.1683,0.07187,0.1559,0.5796,1.046,8.322,0.01011,0.01055,0.01981,0.005742,0.0209,0.002788,9.507,15.4,59.9,274.9,0.1733,0.1239,0.1168,0.04419,0.322,0.09026
-858970,B,10.17,14.88,64.55,311.9,0.1134,0.08061,0.01084,0.0129,0.2743,0.0696,0.5158,1.441,3.312,34.62,0.007514,0.01099,0.007665,0.008193,0.04183,0.005953,11.02,17.45,69.86,368.6,0.1275,0.09866,0.02168,0.02579,0.3557,0.0802
-858981,B,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,0.1828,0.06757,0.3582,2.067,2.493,18.39,0.01193,0.03162,0.03,0.009259,0.03357,0.003048,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712
-858986,M,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,0.7036,1.268,5.373,60.78,0.009407,0.07056,0.06899,0.01848,0.017,0.006113,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132
-859196,B,9.173,13.86,59.2,260.9,0.07721,0.08751,0.05988,0.0218,0.2341,0.06963,0.4098,2.265,2.608,23.52,0.008738,0.03938,0.04312,0.0156,0.04192,0.005822,10.01,19.23,65.59,310.1,0.09836,0.1678,0.1397,0.05087,0.3282,0.0849
-85922302,M,12.68,23.84,82.69,499,0.1122,0.1262,0.1128,0.06873,0.1905,0.0659,0.4255,1.178,2.927,36.46,0.007781,0.02648,0.02973,0.0129,0.01635,0.003601,17.09,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031
-859283,M,14.78,23.94,97.4,668.3,0.1172,0.1479,0.1267,0.09029,0.1953,0.06654,0.3577,1.281,2.45,35.24,0.006703,0.0231,0.02315,0.01184,0.019,0.003224,17.31,33.39,114.6,925.1,0.1648,0.3416,0.3024,0.1614,0.3321,0.08911
-859464,B,9.465,21.01,60.11,269.4,0.1044,0.07773,0.02172,0.01504,0.1717,0.06899,0.2351,2.011,1.66,14.2,0.01052,0.01755,0.01714,0.009333,0.02279,0.004237,10.41,31.56,67.03,330.7,0.1548,0.1664,0.09412,0.06517,0.2878,0.09211
-859465,B,11.31,19.04,71.8,394.1,0.08139,0.04701,0.03709,0.0223,0.1516,0.05667,0.2727,0.9429,1.831,18.15,0.009282,0.009216,0.02063,0.008965,0.02183,0.002146,12.33,23.84,78,466.7,0.129,0.09148,0.1444,0.06961,0.24,0.06641
-859471,B,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,0.3274,1.194,1.885,17.67,0.009549,0.08606,0.3038,0.03322,0.04197,0.009559,10.31,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175
-859487,B,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641
-859575,M,18.94,21.31,123.6,1130,0.09009,0.1029,0.108,0.07951,0.1582,0.05461,0.7888,0.7975,5.486,96.05,0.004444,0.01652,0.02269,0.0137,0.01386,0.001698,24.86,26.58,165.9,1866,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589
-859711,B,8.888,14.64,58.79,244,0.09783,0.1531,0.08606,0.02872,0.1902,0.0898,0.5262,0.8522,3.168,25.44,0.01721,0.09368,0.05671,0.01766,0.02541,0.02193,9.733,15.67,62.56,284.4,0.1207,0.2436,0.1434,0.04786,0.2254,0.1084
-859717,M,17.2,24.52,114.2,929.4,0.1071,0.183,0.1692,0.07944,0.1927,0.06487,0.5907,1.041,3.705,69.47,0.00582,0.05616,0.04252,0.01127,0.01527,0.006299,23.32,33.82,151.6,1681,0.1585,0.7394,0.6566,0.1899,0.3313,0.1339
-859983,M,13.8,15.79,90.43,584.1,0.1007,0.128,0.07789,0.05069,0.1662,0.06566,0.2787,0.6205,1.957,23.35,0.004717,0.02065,0.01759,0.009206,0.0122,0.00313,16.57,20.86,110.3,812.4,0.1411,0.3542,0.2779,0.1383,0.2589,0.103
-8610175,B,12.31,16.52,79.19,470.9,0.09172,0.06829,0.03372,0.02272,0.172,0.05914,0.2505,1.025,1.74,19.68,0.004854,0.01819,0.01826,0.007965,0.01386,0.002304,14.11,23.21,89.71,611.1,0.1176,0.1843,0.1703,0.0866,0.2618,0.07609
-8610404,M,16.07,19.65,104.1,817.7,0.09168,0.08424,0.09769,0.06638,0.1798,0.05391,0.7474,1.016,5.029,79.25,0.01082,0.02203,0.035,0.01809,0.0155,0.001948,19.77,24.56,128.8,1223,0.15,0.2045,0.2829,0.152,0.265,0.06387
-8610629,B,13.53,10.94,87.91,559.2,0.1291,0.1047,0.06877,0.06556,0.2403,0.06641,0.4101,1.014,2.652,32.65,0.0134,0.02839,0.01162,0.008239,0.02572,0.006164,14.08,12.49,91.36,605.5,0.1451,0.1379,0.08539,0.07407,0.271,0.07191
-8610637,M,18.05,16.15,120.2,1006,0.1065,0.2146,0.1684,0.108,0.2152,0.06673,0.9806,0.5505,6.311,134.8,0.00794,0.05839,0.04658,0.0207,0.02591,0.007054,22.39,18.91,150.1,1610,0.1478,0.5634,0.3786,0.2102,0.3751,0.1108
-8610862,M,20.18,23.97,143.7,1245,0.1286,0.3454,0.3754,0.1604,0.2906,0.08142,0.9317,1.885,8.649,116.4,0.01038,0.06835,0.1091,0.02593,0.07895,0.005987,23.37,31.72,170.3,1623,0.1639,0.6164,0.7681,0.2508,0.544,0.09964
-8610908,B,12.86,18,83.19,506.3,0.09934,0.09546,0.03889,0.02315,0.1718,0.05997,0.2655,1.095,1.778,20.35,0.005293,0.01661,0.02071,0.008179,0.01748,0.002848,14.24,24.82,91.88,622.1,0.1289,0.2141,0.1731,0.07926,0.2779,0.07918
-861103,B,11.45,20.97,73.81,401.5,0.1102,0.09362,0.04591,0.02233,0.1842,0.07005,0.3251,2.174,2.077,24.62,0.01037,0.01706,0.02586,0.007506,0.01816,0.003976,13.11,32.16,84.53,525.1,0.1557,0.1676,0.1755,0.06127,0.2762,0.08851
-8611161,B,13.34,15.86,86.49,520,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,0.286,1.016,1.535,12.96,0.006794,0.03575,0.0398,0.01383,0.02134,0.004603,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016
-8611555,M,25.22,24.91,171.5,1878,0.1063,0.2665,0.3339,0.1845,0.1829,0.06782,0.8973,1.474,7.382,120,0.008166,0.05693,0.0573,0.0203,0.01065,0.005893,30,33.62,211.7,2562,0.1573,0.6076,0.6476,0.2867,0.2355,0.1051
-8611792,M,19.1,26.29,129.1,1132,0.1215,0.1791,0.1937,0.1469,0.1634,0.07224,0.519,2.91,5.801,67.1,0.007545,0.0605,0.02134,0.01843,0.03056,0.01039,20.33,32.72,141.3,1298,0.1392,0.2817,0.2432,0.1841,0.2311,0.09203
-8612080,B,12,15.65,76.95,443.3,0.09723,0.07165,0.04151,0.01863,0.2079,0.05968,0.2271,1.255,1.441,16.16,0.005969,0.01812,0.02007,0.007027,0.01972,0.002607,13.67,24.9,87.78,567.9,0.1377,0.2003,0.2267,0.07632,0.3379,0.07924
-8612399,M,18.46,18.52,121.1,1075,0.09874,0.1053,0.1335,0.08795,0.2132,0.06022,0.6997,1.475,4.782,80.6,0.006471,0.01649,0.02806,0.0142,0.0237,0.003755,22.93,27.68,152.2,1603,0.1398,0.2089,0.3157,0.1642,0.3695,0.08579
-86135501,M,14.48,21.46,94.25,648.2,0.09444,0.09947,0.1204,0.04938,0.2075,0.05636,0.4204,2.22,3.301,38.87,0.009369,0.02983,0.05371,0.01761,0.02418,0.003249,16.21,29.25,108.4,808.9,0.1306,0.1976,0.3349,0.1225,0.302,0.06846
-86135502,M,19.02,24.59,122,1076,0.09029,0.1206,0.1468,0.08271,0.1953,0.05629,0.5495,0.6636,3.055,57.65,0.003872,0.01842,0.0371,0.012,0.01964,0.003337,24.56,30.41,152.9,1623,0.1249,0.3206,0.5755,0.1956,0.3956,0.09288
-861597,B,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,0.2978,1.502,2.203,20.95,0.007112,0.02493,0.02703,0.01293,0.01958,0.004463,13.83,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261
-861598,B,14.64,15.24,95.77,651.9,0.1132,0.1339,0.09966,0.07064,0.2116,0.06346,0.5115,0.7372,3.814,42.76,0.005508,0.04412,0.04436,0.01623,0.02427,0.004841,16.34,18.24,109.4,803.6,0.1277,0.3089,0.2604,0.1397,0.3151,0.08473
-861648,B,14.62,24.02,94.57,662.7,0.08974,0.08606,0.03102,0.02957,0.1685,0.05866,0.3721,1.111,2.279,33.76,0.004868,0.01818,0.01121,0.008606,0.02085,0.002893,16.11,29.11,102.9,803.7,0.1115,0.1766,0.09189,0.06946,0.2522,0.07246
-861799,M,15.37,22.76,100.2,728.2,0.092,0.1036,0.1122,0.07483,0.1717,0.06097,0.3129,0.8413,2.075,29.44,0.009882,0.02444,0.04531,0.01763,0.02471,0.002142,16.43,25.84,107.5,830.9,0.1257,0.1997,0.2846,0.1476,0.2556,0.06828
-861853,B,13.27,14.76,84.74,551.7,0.07355,0.05055,0.03261,0.02648,0.1386,0.05318,0.4057,1.153,2.701,36.35,0.004481,0.01038,0.01358,0.01082,0.01069,0.001435,16.36,22.35,104.5,830.6,0.1006,0.1238,0.135,0.1001,0.2027,0.06206
-862009,B,13.45,18.3,86.6,555.1,0.1022,0.08165,0.03974,0.0278,0.1638,0.0571,0.295,1.373,2.099,25.22,0.005884,0.01491,0.01872,0.009366,0.01884,0.001817,15.1,25.94,97.59,699.4,0.1339,0.1751,0.1381,0.07911,0.2678,0.06603
-862028,M,15.06,19.83,100.3,705.6,0.1039,0.1553,0.17,0.08815,0.1855,0.06284,0.4768,0.9644,3.706,47.14,0.00925,0.03715,0.04867,0.01851,0.01498,0.00352,18.23,24.23,123.5,1025,0.1551,0.4203,0.5203,0.2115,0.2834,0.08234
-86208,M,20.26,23.03,132.4,1264,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,0.7576,1.509,4.554,87.87,0.006016,0.03482,0.04232,0.01269,0.02657,0.004411,24.22,31.59,156.1,1750,0.119,0.3539,0.4098,0.1573,0.3689,0.08368
-86211,B,12.18,17.84,77.79,451.1,0.1045,0.07057,0.0249,0.02941,0.19,0.06635,0.3661,1.511,2.41,24.44,0.005433,0.01179,0.01131,0.01519,0.0222,0.003408,12.83,20.92,82.14,495.2,0.114,0.09358,0.0498,0.05882,0.2227,0.07376
-862261,B,9.787,19.94,62.11,294.5,0.1024,0.05301,0.006829,0.007937,0.135,0.0689,0.335,2.043,2.132,20.05,0.01113,0.01463,0.005308,0.00525,0.01801,0.005667,10.92,26.29,68.81,366.1,0.1316,0.09473,0.02049,0.02381,0.1934,0.08988
-862485,B,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,0.06582,0.2315,0.5391,1.475,15.75,0.006153,0.0133,0.01693,0.006884,0.01651,0.002551,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756
-862548,M,14.42,19.77,94.48,642.5,0.09752,0.1141,0.09388,0.05839,0.1879,0.0639,0.2895,1.851,2.376,26.85,0.008005,0.02895,0.03321,0.01424,0.01462,0.004452,16.33,30.86,109.5,826.4,0.1431,0.3026,0.3194,0.1565,0.2718,0.09353
-862717,M,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,0.1609,0.05871,0.4565,1.29,2.861,43.14,0.005872,0.01488,0.02647,0.009921,0.01465,0.002355,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397
-862722,B,6.981,13.43,43.79,143.5,0.117,0.07568,0,0,0.193,0.07818,0.2241,1.508,1.553,9.833,0.01019,0.01084,0,0,0.02659,0.0041,7.93,19.54,50.41,185.2,0.1584,0.1202,0,0,0.2932,0.09382
-862965,B,12.18,20.52,77.22,458.7,0.08013,0.04038,0.02383,0.0177,0.1739,0.05677,0.1924,1.571,1.183,14.68,0.00508,0.006098,0.01069,0.006797,0.01447,0.001532,13.34,32.84,84.58,547.8,0.1123,0.08862,0.1145,0.07431,0.2694,0.06878
-862980,B,9.876,19.4,63.95,298.3,0.1005,0.09697,0.06154,0.03029,0.1945,0.06322,0.1803,1.222,1.528,11.77,0.009058,0.02196,0.03029,0.01112,0.01609,0.00357,10.76,26.83,72.22,361.2,0.1559,0.2302,0.2644,0.09749,0.2622,0.0849
-862989,B,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,0.2217,0.06481,0.355,1.534,2.302,23.13,0.007595,0.02219,0.0288,0.008614,0.0271,0.003451,11.54,23.31,74.22,402.8,0.1219,0.1486,0.07987,0.03203,0.2826,0.07552
-863030,M,13.11,15.56,87.21,530.2,0.1398,0.1765,0.2071,0.09601,0.1925,0.07692,0.3908,0.9238,2.41,34.66,0.007162,0.02912,0.05473,0.01388,0.01547,0.007098,16.31,22.4,106.4,827.2,0.1862,0.4099,0.6376,0.1986,0.3147,0.1405
-863031,B,11.64,18.33,75.17,412.5,0.1142,0.1017,0.0707,0.03485,0.1801,0.0652,0.306,1.657,2.155,20.62,0.00854,0.0231,0.02945,0.01398,0.01565,0.00384,13.14,29.26,85.51,521.7,0.1688,0.266,0.2873,0.1218,0.2806,0.09097
-863270,B,12.36,18.54,79.01,466.7,0.08477,0.06815,0.02643,0.01921,0.1602,0.06066,0.1199,0.8944,0.8484,9.227,0.003457,0.01047,0.01167,0.005558,0.01251,0.001356,13.29,27.49,85.56,544.1,0.1184,0.1963,0.1937,0.08442,0.2983,0.07185
-86355,M,22.27,19.67,152.8,1509,0.1326,0.2768,0.4264,0.1823,0.2556,0.07039,1.215,1.545,10.05,170,0.006515,0.08668,0.104,0.0248,0.03112,0.005037,28.4,28.01,206.8,2360,0.1701,0.6997,0.9608,0.291,0.4055,0.09789
-864018,B,11.34,21.26,72.48,396.5,0.08759,0.06575,0.05133,0.01899,0.1487,0.06529,0.2344,0.9861,1.597,16.41,0.009113,0.01557,0.02443,0.006435,0.01568,0.002477,13.01,29.15,83.99,518.1,0.1699,0.2196,0.312,0.08278,0.2829,0.08832
-864033,B,9.777,16.99,62.5,290.2,0.1037,0.08404,0.04334,0.01778,0.1584,0.07065,0.403,1.424,2.747,22.87,0.01385,0.02932,0.02722,0.01023,0.03281,0.004638,11.05,21.47,71.68,367,0.1467,0.1765,0.13,0.05334,0.2533,0.08468
-86408,B,12.63,20.76,82.15,480.4,0.09933,0.1209,0.1065,0.06021,0.1735,0.0707,0.3424,1.803,2.711,20.48,0.01291,0.04042,0.05101,0.02295,0.02144,0.005891,13.33,25.47,89,527.4,0.1287,0.225,0.2216,0.1105,0.2226,0.08486
-86409,B,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,0.1704,0.07769,0.3628,1.49,3.399,29.25,0.005298,0.07446,0.1435,0.02292,0.02566,0.01298,15.3,23.73,107,709,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082
-864292,B,10.51,20.19,68.64,334.2,0.1122,0.1303,0.06476,0.03068,0.1922,0.07782,0.3336,1.86,2.041,19.91,0.01188,0.03747,0.04591,0.01544,0.02287,0.006792,11.16,22.75,72.62,374.4,0.13,0.2049,0.1295,0.06136,0.2383,0.09026
-864496,B,8.726,15.83,55.84,230.9,0.115,0.08201,0.04132,0.01924,0.1649,0.07633,0.1665,0.5864,1.354,8.966,0.008261,0.02213,0.03259,0.0104,0.01708,0.003806,9.628,19.62,64.48,284.4,0.1724,0.2364,0.2456,0.105,0.2926,0.1017
-864685,B,11.93,21.53,76.53,438.6,0.09768,0.07849,0.03328,0.02008,0.1688,0.06194,0.3118,0.9227,2,24.79,0.007803,0.02507,0.01835,0.007711,0.01278,0.003856,13.67,26.15,87.54,583,0.15,0.2399,0.1503,0.07247,0.2438,0.08541
-864726,B,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,0.3132,0.9789,3.28,16.94,0.01835,0.0676,0.09263,0.02308,0.02384,0.005601,9.414,17.07,63.34,270,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722
-864729,M,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,0.4266,0.9489,2.989,41.18,0.006985,0.02563,0.03011,0.01271,0.01602,0.003884,18.81,27.37,127.1,1095,0.1878,0.448,0.4704,0.2027,0.3585,0.1065
-864877,M,15.78,22.91,105.7,782.6,0.1155,0.1752,0.2133,0.09479,0.2096,0.07331,0.552,1.072,3.598,58.63,0.008699,0.03976,0.0595,0.0139,0.01495,0.005984,20.19,30.5,130.3,1272,0.1855,0.4925,0.7356,0.2034,0.3274,0.1252
-865128,M,17.95,20.01,114.2,982,0.08402,0.06722,0.07293,0.05596,0.2129,0.05025,0.5506,1.214,3.357,54.04,0.004024,0.008422,0.02291,0.009863,0.05014,0.001902,20.58,27.83,129.2,1261,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111
-865137,B,11.41,10.82,73.34,403.3,0.09373,0.06685,0.03512,0.02623,0.1667,0.06113,0.1408,0.4607,1.103,10.5,0.00604,0.01529,0.01514,0.00646,0.01344,0.002206,12.82,15.97,83.74,510.5,0.1548,0.239,0.2102,0.08958,0.3016,0.08523
-86517,M,18.66,17.12,121.4,1077,0.1054,0.11,0.1457,0.08665,0.1966,0.06213,0.7128,1.581,4.895,90.47,0.008102,0.02101,0.03342,0.01601,0.02045,0.00457,22.25,24.9,145.4,1549,0.1503,0.2291,0.3272,0.1674,0.2894,0.08456
-865423,M,24.25,20.2,166.2,1761,0.1447,0.2867,0.4268,0.2012,0.2655,0.06877,1.509,3.12,9.807,233,0.02333,0.09806,0.1278,0.01822,0.04547,0.009875,26.02,23.99,180.9,2073,0.1696,0.4244,0.5803,0.2248,0.3222,0.08009
-865432,B,14.5,10.89,94.28,640.7,0.1101,0.1099,0.08842,0.05778,0.1856,0.06402,0.2929,0.857,1.928,24.19,0.003818,0.01276,0.02882,0.012,0.0191,0.002808,15.7,15.98,102.8,745.5,0.1313,0.1788,0.256,0.1221,0.2889,0.08006
-865468,B,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,0.1639,1.14,1.223,14.66,0.005919,0.0327,0.04957,0.01038,0.01208,0.004076,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628
-86561,B,13.85,17.21,88.44,588.7,0.08785,0.06136,0.0142,0.01141,0.1614,0.0589,0.2185,0.8561,1.495,17.91,0.004599,0.009169,0.009127,0.004814,0.01247,0.001708,15.49,23.58,100.3,725.9,0.1157,0.135,0.08115,0.05104,0.2364,0.07182
-866083,M,13.61,24.69,87.76,572.6,0.09258,0.07862,0.05285,0.03085,0.1761,0.0613,0.231,1.005,1.752,19.83,0.004088,0.01174,0.01796,0.00688,0.01323,0.001465,16.89,35.64,113.2,848.7,0.1471,0.2884,0.3796,0.1329,0.347,0.079
-866203,M,19,18.91,123.4,1138,0.08217,0.08028,0.09271,0.05627,0.1946,0.05044,0.6896,1.342,5.216,81.23,0.004428,0.02731,0.0404,0.01361,0.0203,0.002686,22.32,25.73,148.2,1538,0.1021,0.2264,0.3207,0.1218,0.2841,0.06541
-866458,B,15.1,16.39,99.58,674.5,0.115,0.1807,0.1138,0.08534,0.2001,0.06467,0.4309,1.068,2.796,39.84,0.009006,0.04185,0.03204,0.02258,0.02353,0.004984,16.11,18.33,105.9,762.6,0.1386,0.2883,0.196,0.1423,0.259,0.07779
-866674,M,19.79,25.12,130.4,1192,0.1015,0.1589,0.2545,0.1149,0.2202,0.06113,0.4953,1.199,2.765,63.33,0.005033,0.03179,0.04755,0.01043,0.01578,0.003224,22.63,33.58,148.7,1589,0.1275,0.3861,0.5673,0.1732,0.3305,0.08465
-866714,B,12.19,13.29,79.08,455.8,0.1066,0.09509,0.02855,0.02882,0.188,0.06471,0.2005,0.8163,1.973,15.24,0.006773,0.02456,0.01018,0.008094,0.02662,0.004143,13.34,17.81,91.38,545.2,0.1427,0.2585,0.09915,0.08187,0.3469,0.09241
-8670,M,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,0.05796,0.4743,0.7859,3.094,48.31,0.00624,0.01484,0.02813,0.01093,0.01397,0.002461,19.26,26,124.9,1156,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019
-86730502,M,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,0.4332,1.265,2.844,43.68,0.004877,0.01952,0.02219,0.009231,0.01535,0.002373,19.47,31.68,129.7,1175,0.1395,0.3055,0.2992,0.1312,0.348,0.07619
-867387,B,15.71,13.93,102,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,0.3117,0.8155,1.972,27.94,0.005217,0.01515,0.01678,0.01268,0.01669,0.00233,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071
-867739,M,18.45,21.91,120.2,1075,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,0.5959,1.202,3.766,68.35,0.006001,0.01422,0.02855,0.009148,0.01492,0.002205,22.52,31.39,145.6,1590,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761
-868202,M,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,0.06065,0.2367,1.38,1.457,19.87,0.007499,0.01202,0.02332,0.00892,0.01647,0.002629,14.49,33.37,92.04,653.6,0.1419,0.1523,0.2177,0.09331,0.2829,0.08067
-868223,B,11.71,16.67,74.72,423.6,0.1051,0.06095,0.03592,0.026,0.1339,0.05945,0.4489,2.508,3.258,34.37,0.006578,0.0138,0.02662,0.01307,0.01359,0.003707,13.33,25.48,86.16,546.7,0.1271,0.1028,0.1046,0.06968,0.1712,0.07343
-868682,B,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,0.1759,0.9938,1.143,12.67,0.005133,0.01521,0.01434,0.008602,0.01501,0.001588,12.32,22.02,79.93,462,0.119,0.1648,0.1399,0.08476,0.2676,0.06765
-868826,M,14.95,17.57,96.85,678.1,0.1167,0.1305,0.1539,0.08624,0.1957,0.06216,1.296,1.452,8.419,101.9,0.01,0.0348,0.06577,0.02801,0.05168,0.002887,18.55,21.43,121.4,971.4,0.1411,0.2164,0.3355,0.1667,0.3414,0.07147
-868871,B,11.28,13.39,73,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
-868999,B,9.738,11.97,61.24,288.5,0.0925,0.04102,0,0,0.1903,0.06422,0.1988,0.496,1.218,12.26,0.00604,0.005656,0,0,0.02277,0.00322,10.62,14.1,66.53,342.9,0.1234,0.07204,0,0,0.3105,0.08151
-869104,M,16.11,18.05,105.1,813,0.09721,0.1137,0.09447,0.05943,0.1861,0.06248,0.7049,1.332,4.533,74.08,0.00677,0.01938,0.03067,0.01167,0.01875,0.003434,19.92,25.27,129,1233,0.1314,0.2236,0.2802,0.1216,0.2792,0.08158
-869218,B,11.43,17.31,73.66,398,0.1092,0.09486,0.02031,0.01861,0.1645,0.06562,0.2843,1.908,1.937,21.38,0.006664,0.01735,0.01158,0.00952,0.02282,0.003526,12.78,26.76,82.66,503,0.1413,0.1792,0.07708,0.06402,0.2584,0.08096
-869224,B,12.9,15.92,83.74,512.2,0.08677,0.09509,0.04894,0.03088,0.1778,0.06235,0.2143,0.7712,1.689,16.64,0.005324,0.01563,0.0151,0.007584,0.02104,0.001887,14.48,21.82,97.17,643.8,0.1312,0.2548,0.209,0.1012,0.3549,0.08118
-869254,B,10.75,14.97,68.26,355.3,0.07793,0.05139,0.02251,0.007875,0.1399,0.05688,0.2525,1.239,1.806,17.74,0.006547,0.01781,0.02018,0.005612,0.01671,0.00236,11.95,20.72,77.79,441.2,0.1076,0.1223,0.09755,0.03413,0.23,0.06769
-869476,B,11.9,14.65,78.11,432.8,0.1152,0.1296,0.0371,0.03003,0.1995,0.07839,0.3962,0.6538,3.021,25.03,0.01017,0.04741,0.02789,0.0111,0.03127,0.009423,13.15,16.51,86.26,509.6,0.1424,0.2517,0.0942,0.06042,0.2727,0.1036
-869691,M,11.8,16.58,78.99,432,0.1091,0.17,0.1659,0.07415,0.2678,0.07371,0.3197,1.426,2.281,24.72,0.005427,0.03633,0.04649,0.01843,0.05628,0.004635,13.74,26.38,91.93,591.7,0.1385,0.4092,0.4504,0.1865,0.5774,0.103
-86973701,B,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,0.1744,0.06493,0.422,1.909,3.271,39.43,0.00579,0.04877,0.05303,0.01527,0.03356,0.009368,16.25,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218
-86973702,B,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,0.1724,0.06081,0.2406,0.7394,2.12,21.2,0.005706,0.02297,0.03114,0.01493,0.01454,0.002528,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
-869931,B,13.74,17.91,88.12,585,0.07944,0.06376,0.02881,0.01329,0.1473,0.0558,0.25,0.7574,1.573,21.47,0.002838,0.01592,0.0178,0.005828,0.01329,0.001976,15.34,22.46,97.19,725.9,0.09711,0.1824,0.1564,0.06019,0.235,0.07014
-871001501,B,13,20.78,83.51,519.4,0.1135,0.07589,0.03136,0.02645,0.254,0.06087,0.4202,1.322,2.873,34.78,0.007017,0.01142,0.01949,0.01153,0.02951,0.001533,14.16,24.11,90.82,616.7,0.1297,0.1105,0.08112,0.06296,0.3196,0.06435
-871001502,B,8.219,20.7,53.27,203.9,0.09405,0.1305,0.1321,0.02168,0.2222,0.08261,0.1935,1.962,1.243,10.21,0.01243,0.05416,0.07753,0.01022,0.02309,0.01178,9.092,29.72,58.08,249.8,0.163,0.431,0.5381,0.07879,0.3322,0.1486
-8710441,B,9.731,15.34,63.78,300.2,0.1072,0.1599,0.4108,0.07857,0.2548,0.09296,0.8245,2.664,4.073,49.85,0.01097,0.09586,0.396,0.05279,0.03546,0.02984,11.02,19.49,71.04,380.5,0.1292,0.2772,0.8216,0.1571,0.3108,0.1259
-87106,B,11.15,13.08,70.87,381.9,0.09754,0.05113,0.01982,0.01786,0.183,0.06105,0.2251,0.7815,1.429,15.48,0.009019,0.008985,0.01196,0.008232,0.02388,0.001619,11.99,16.3,76.25,440.8,0.1341,0.08971,0.07116,0.05506,0.2859,0.06772
-8711002,B,13.15,15.34,85.31,538.9,0.09384,0.08498,0.09293,0.03483,0.1822,0.06207,0.271,0.7927,1.819,22.79,0.008584,0.02017,0.03047,0.009536,0.02769,0.003479,14.77,20.5,97.67,677.3,0.1478,0.2256,0.3009,0.09722,0.3849,0.08633
-8711003,B,12.25,17.94,78.27,460.3,0.08654,0.06679,0.03885,0.02331,0.197,0.06228,0.22,0.9823,1.484,16.51,0.005518,0.01562,0.01994,0.007924,0.01799,0.002484,13.59,25.22,86.6,564.2,0.1217,0.1788,0.1943,0.08211,0.3113,0.08132
-8711202,M,17.68,20.74,117.4,963.7,0.1115,0.1665,0.1855,0.1054,0.1971,0.06166,0.8113,1.4,5.54,93.91,0.009037,0.04954,0.05206,0.01841,0.01778,0.004968,20.47,25.11,132.9,1302,0.1418,0.3498,0.3583,0.1515,0.2463,0.07738
-8711216,B,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,0.1844,0.05268,0.4789,2.06,3.479,46.61,0.003443,0.02661,0.03056,0.0111,0.0152,0.001519,18.22,28.07,120.3,1032,0.08774,0.171,0.1882,0.08436,0.2527,0.05972
-871122,B,12.06,12.74,76.84,448.6,0.09311,0.05241,0.01972,0.01963,0.159,0.05907,0.1822,0.7285,1.171,13.25,0.005528,0.009789,0.008342,0.006273,0.01465,0.00253,13.14,18.41,84.08,532.8,0.1275,0.1232,0.08636,0.07025,0.2514,0.07898
-871149,B,10.9,12.96,68.69,366.8,0.07515,0.03718,0.00309,0.006588,0.1442,0.05743,0.2818,0.7614,1.808,18.54,0.006142,0.006134,0.001835,0.003576,0.01637,0.002665,12.36,18.2,78.07,470,0.1171,0.08294,0.01854,0.03953,0.2738,0.07685
-8711561,B,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,0.06453,0.5018,1.693,3.926,38.34,0.009433,0.02405,0.04167,0.01152,0.03397,0.005061,13.32,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987
-8711803,M,19.19,15.94,126.3,1157,0.08694,0.1185,0.1193,0.09667,0.1741,0.05176,1,0.6336,6.971,119.3,0.009406,0.03055,0.04344,0.02794,0.03156,0.003362,22.03,17.81,146.6,1495,0.1124,0.2016,0.2264,0.1777,0.2443,0.06251
-871201,M,19.59,18.15,130.7,1214,0.112,0.1666,0.2508,0.1286,0.2027,0.06082,0.7364,1.048,4.792,97.07,0.004057,0.02277,0.04029,0.01303,0.01686,0.003318,26.73,26.39,174.9,2232,0.1438,0.3846,0.681,0.2247,0.3643,0.09223
-8712064,B,12.34,22.22,79.85,464.5,0.1012,0.1015,0.0537,0.02822,0.1551,0.06761,0.2949,1.656,1.955,21.55,0.01134,0.03175,0.03125,0.01135,0.01879,0.005348,13.58,28.68,87.36,553,0.1452,0.2338,0.1688,0.08194,0.2268,0.09082
-8712289,M,23.27,22.04,152.1,1686,0.08439,0.1145,0.1324,0.09702,0.1801,0.05553,0.6642,0.8561,4.603,97.85,0.00491,0.02544,0.02822,0.01623,0.01956,0.00374,28.01,28.22,184.2,2403,0.1228,0.3583,0.3948,0.2346,0.3589,0.09187
-8712291,B,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,0.05266,0.184,1.065,1.286,16.64,0.003634,0.007983,0.008268,0.006432,0.01924,0.00152,15.98,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085
-87127,B,10.8,9.71,68.77,357.6,0.09594,0.05736,0.02531,0.01698,0.1381,0.064,0.1728,0.4064,1.126,11.48,0.007809,0.009816,0.01099,0.005344,0.01254,0.00212,11.6,12.02,73.66,414,0.1436,0.1257,0.1047,0.04603,0.209,0.07699
-8712729,M,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,0.599,1.391,4.129,67.34,0.006123,0.0247,0.02626,0.01604,0.02091,0.003493,20.05,26.3,130.7,1260,0.1168,0.2119,0.2318,0.1474,0.281,0.07228
-8712766,M,17.47,24.68,116.1,984.6,0.1049,0.1603,0.2159,0.1043,0.1538,0.06365,1.088,1.41,7.337,122.3,0.006174,0.03634,0.04644,0.01569,0.01145,0.00512,23.14,32.33,155.3,1660,0.1376,0.383,0.489,0.1721,0.216,0.093
-8712853,B,14.97,16.95,96.22,685.9,0.09855,0.07885,0.02602,0.03781,0.178,0.0565,0.2713,1.217,1.893,24.28,0.00508,0.0137,0.007276,0.009073,0.0135,0.001706,16.11,23,104.6,793.7,0.1216,0.1637,0.06648,0.08485,0.2404,0.06428
-87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,0.05955,0.236,0.6656,1.67,17.43,0.008045,0.0118,0.01683,0.01241,0.01924,0.002248,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
-87163,M,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,0.4697,1.147,3.142,43.4,0.006003,0.01063,0.02151,0.009443,0.0152,0.001868,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
-87164,M,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,0.4209,0.6583,2.805,44.64,0.005393,0.02321,0.04303,0.0132,0.01792,0.004168,18.79,17.04,125,1102,0.1531,0.3583,0.583,0.1827,0.3216,0.101
-871641,B,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,0.1566,0.06669,0.2073,1.805,1.377,19.08,0.01496,0.02121,0.01453,0.01583,0.03082,0.004785,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313
-871642,B,10.66,15.15,67.49,349.6,0.08792,0.04302,0,0,0.1928,0.05975,0.3309,1.925,2.155,21.98,0.008713,0.01017,0,0,0.03265,0.001002,11.54,19.2,73.2,408.3,0.1076,0.06791,0,0,0.271,0.06164
-872113,B,8.671,14.45,54.42,227.2,0.09138,0.04276,0,0,0.1722,0.06724,0.2204,0.7873,1.435,11.36,0.009172,0.008007,0,0,0.02711,0.003399,9.262,17.04,58.36,259.2,0.1162,0.07057,0,0,0.2592,0.07848
-872608,B,9.904,18.06,64.6,302.4,0.09699,0.1294,0.1307,0.03716,0.1669,0.08116,0.4311,2.261,3.132,27.48,0.01286,0.08808,0.1197,0.0246,0.0388,0.01792,11.26,24.39,73.07,390.2,0.1301,0.295,0.3486,0.0991,0.2614,0.1162
-87281702,M,16.46,20.11,109.3,832.9,0.09831,0.1556,0.1793,0.08866,0.1794,0.06323,0.3037,1.284,2.482,31.59,0.006627,0.04094,0.05371,0.01813,0.01682,0.004584,17.79,28.45,123.5,981.2,0.1415,0.4667,0.5862,0.2035,0.3054,0.09519
-873357,B,13.01,22.22,82.01,526.4,0.06251,0.01938,0.001595,0.001852,0.1395,0.05234,0.1731,1.142,1.101,14.34,0.003418,0.002252,0.001595,0.001852,0.01613,0.0009683,14,29.02,88.18,608.8,0.08125,0.03432,0.007977,0.009259,0.2295,0.05843
-873586,B,12.81,13.06,81.29,508.8,0.08739,0.03774,0.009193,0.0133,0.1466,0.06133,0.2889,0.9899,1.778,21.79,0.008534,0.006364,0.00618,0.007408,0.01065,0.003351,13.63,16.15,86.7,570.7,0.1162,0.05445,0.02758,0.0399,0.1783,0.07319
-873592,M,27.22,21.87,182.1,2250,0.1094,0.1914,0.2871,0.1878,0.18,0.0577,0.8361,1.481,5.82,128.7,0.004631,0.02537,0.03109,0.01241,0.01575,0.002747,33.12,32.85,220.8,3216,0.1472,0.4034,0.534,0.2688,0.2856,0.08082
-873593,M,21.09,26.57,142.7,1311,0.1141,0.2832,0.2487,0.1496,0.2395,0.07398,0.6298,0.7629,4.414,81.46,0.004253,0.04759,0.03872,0.01567,0.01798,0.005295,26.68,33.48,176.5,2089,0.1491,0.7584,0.678,0.2903,0.4098,0.1284
-873701,M,15.7,20.31,101.2,766.6,0.09597,0.08799,0.06593,0.05189,0.1618,0.05549,0.3699,1.15,2.406,40.98,0.004626,0.02263,0.01954,0.009767,0.01547,0.00243,20.11,32.82,129.3,1269,0.1414,0.3547,0.2902,0.1541,0.3437,0.08631
-873843,B,11.41,14.92,73.53,402,0.09059,0.08155,0.06181,0.02361,0.1167,0.06217,0.3344,1.108,1.902,22.77,0.007356,0.03728,0.05915,0.01712,0.02165,0.004784,12.37,17.7,79.12,467.2,0.1121,0.161,0.1648,0.06296,0.1811,0.07427
-873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,0.2054,0.4956,1.344,19.53,0.00329,0.01395,0.01774,0.006009,0.01172,0.002575,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
-874158,B,10.08,15.11,63.76,317.5,0.09267,0.04695,0.001597,0.002404,0.1703,0.06048,0.4245,1.268,2.68,26.43,0.01439,0.012,0.001597,0.002404,0.02538,0.00347,11.87,21.18,75.39,437,0.1521,0.1019,0.00692,0.01042,0.2933,0.07697
-874217,M,18.31,18.58,118.6,1041,0.08588,0.08468,0.08169,0.05814,0.1621,0.05425,0.2577,0.4757,1.817,28.92,0.002866,0.009181,0.01412,0.006719,0.01069,0.001087,21.31,26.36,139.2,1410,0.1234,0.2445,0.3538,0.1571,0.3206,0.06938
-874373,B,11.71,17.19,74.68,420.3,0.09774,0.06141,0.03809,0.03239,0.1516,0.06095,0.2451,0.7655,1.742,17.86,0.006905,0.008704,0.01978,0.01185,0.01897,0.001671,13.01,21.39,84.42,521.5,0.1323,0.104,0.1521,0.1099,0.2572,0.07097
-874662,B,11.81,17.39,75.27,428.9,0.1007,0.05562,0.02353,0.01553,0.1718,0.0578,0.1859,1.926,1.011,14.47,0.007831,0.008776,0.01556,0.00624,0.03139,0.001988,12.57,26.48,79.57,489.5,0.1356,0.1,0.08803,0.04306,0.32,0.06576
-874839,B,12.3,15.9,78.83,463.7,0.0808,0.07253,0.03844,0.01654,0.1667,0.05474,0.2382,0.8355,1.687,18.32,0.005996,0.02212,0.02117,0.006433,0.02025,0.001725,13.35,19.59,86.65,546.7,0.1096,0.165,0.1423,0.04815,0.2482,0.06306
-874858,M,14.22,23.12,94.37,609.9,0.1075,0.2413,0.1981,0.06618,0.2384,0.07542,0.286,2.11,2.112,31.72,0.00797,0.1354,0.1166,0.01666,0.05113,0.01172,15.74,37.18,106.4,762.4,0.1533,0.9327,0.8488,0.1772,0.5166,0.1446
-875093,B,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,0.7311,1.748,5.118,53.65,0.004571,0.0179,0.02176,0.01757,0.03373,0.005875,13.75,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871
-875099,B,9.72,18.22,60.73,288.1,0.0695,0.02344,0,0,0.1653,0.06447,0.3539,4.885,2.23,21.69,0.001713,0.006736,0,0,0.03799,0.001688,9.968,20.83,62.25,303.8,0.07117,0.02729,0,0,0.1909,0.06559
-875263,M,12.34,26.86,81.15,477.4,0.1034,0.1353,0.1085,0.04562,0.1943,0.06937,0.4053,1.809,2.642,34.44,0.009098,0.03845,0.03763,0.01321,0.01878,0.005672,15.65,39.34,101.7,768.9,0.1785,0.4706,0.4425,0.1459,0.3215,0.1205
-87556202,M,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,0.1737,0.06672,0.2796,0.9622,3.591,25.2,0.008081,0.05122,0.05551,0.01883,0.02545,0.004312,16.08,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701
-875878,B,12.91,16.33,82.53,516.4,0.07941,0.05366,0.03873,0.02377,0.1829,0.05667,0.1942,0.9086,1.493,15.75,0.005298,0.01587,0.02321,0.00842,0.01853,0.002152,13.88,22,90.81,600.6,0.1097,0.1506,0.1764,0.08235,0.3024,0.06949
-875938,M,13.77,22.29,90.63,588.9,0.12,0.1267,0.1385,0.06526,0.1834,0.06877,0.6191,2.112,4.906,49.7,0.0138,0.03348,0.04665,0.0206,0.02689,0.004306,16.39,34.01,111.6,806.9,0.1737,0.3122,0.3809,0.1673,0.308,0.09333
-877159,M,18.08,21.84,117.4,1024,0.07371,0.08642,0.1103,0.05778,0.177,0.0534,0.6362,1.305,4.312,76.36,0.00553,0.05296,0.0611,0.01444,0.0214,0.005036,19.76,24.7,129.1,1228,0.08822,0.1963,0.2535,0.09181,0.2369,0.06558
-877486,M,19.18,22.49,127.5,1148,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,0.4357,1.073,3.833,54.22,0.005524,0.03698,0.02706,0.01221,0.01415,0.003397,23.36,32.06,166.4,1688,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
-877500,M,14.45,20.22,94.49,642.7,0.09872,0.1206,0.118,0.0598,0.195,0.06466,0.2092,0.6509,1.446,19.42,0.004044,0.01597,0.02,0.007303,0.01522,0.001976,18.33,30.12,117.9,1044,0.1552,0.4056,0.4967,0.1838,0.4753,0.1013
-877501,B,12.23,19.56,78.54,461,0.09586,0.08087,0.04187,0.04107,0.1979,0.06013,0.3534,1.326,2.308,27.24,0.007514,0.01779,0.01401,0.0114,0.01503,0.003338,14.44,28.36,92.15,638.4,0.1429,0.2042,0.1377,0.108,0.2668,0.08174
-877989,M,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,0.1506,0.05491,0.3971,0.8282,3.088,40.73,0.00609,0.02569,0.02713,0.01345,0.01594,0.002658,20.42,25.84,139.5,1239,0.1381,0.342,0.3508,0.1939,0.2928,0.07867
-878796,M,23.29,26.67,158.9,1685,0.1141,0.2084,0.3523,0.162,0.22,0.06229,0.5539,1.56,4.667,83.16,0.009327,0.05121,0.08958,0.02465,0.02175,0.005195,25.12,32.68,177,1986,0.1536,0.4167,0.7892,0.2733,0.3198,0.08762
-87880,M,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,0.5648,1.93,3.909,52.72,0.008824,0.03108,0.03112,0.01291,0.01998,0.004506,19.2,41.85,128.5,1153,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086
-87930,B,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,0.3961,1.044,2.497,30.29,0.006953,0.01911,0.02701,0.01037,0.01782,0.003586,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875
-879523,M,15.12,16.68,98.78,716.6,0.08876,0.09588,0.0755,0.04079,0.1594,0.05986,0.2711,0.3621,1.974,26.44,0.005472,0.01919,0.02039,0.00826,0.01523,0.002881,17.77,20.24,117.7,989.5,0.1491,0.3331,0.3327,0.1252,0.3415,0.0974
-879804,B,9.876,17.27,62.92,295.4,0.1089,0.07232,0.01756,0.01952,0.1934,0.06285,0.2137,1.342,1.517,12.33,0.009719,0.01249,0.007975,0.007527,0.0221,0.002472,10.42,23.22,67.08,331.6,0.1415,0.1247,0.06213,0.05588,0.2989,0.0738
-879830,M,17.01,20.26,109.7,904.3,0.08772,0.07304,0.0695,0.0539,0.2026,0.05223,0.5858,0.8554,4.106,68.46,0.005038,0.01503,0.01946,0.01123,0.02294,0.002581,19.8,25.05,130,1210,0.1111,0.1486,0.1932,0.1096,0.3275,0.06469
-8810158,B,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,0.1931,0.9223,1.491,15.09,0.005251,0.03041,0.02526,0.008304,0.02514,0.004198,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076
-8810436,B,15.27,12.91,98.17,725.5,0.08182,0.0623,0.05892,0.03157,0.1359,0.05526,0.2134,0.3628,1.525,20,0.004291,0.01236,0.01841,0.007373,0.009539,0.001656,17.38,15.92,113.7,932.7,0.1222,0.2186,0.2962,0.1035,0.232,0.07474
-881046502,M,20.58,22.14,134.7,1290,0.0909,0.1348,0.164,0.09561,0.1765,0.05024,0.8601,1.48,7.029,111.7,0.008124,0.03611,0.05489,0.02765,0.03176,0.002365,23.24,27.84,158.3,1656,0.1178,0.292,0.3861,0.192,0.2909,0.05865
-8810528,B,11.84,18.94,75.51,428,0.08871,0.069,0.02669,0.01393,0.1533,0.06057,0.2222,0.8652,1.444,17.12,0.005517,0.01727,0.02045,0.006747,0.01616,0.002922,13.3,24.99,85.22,546.3,0.128,0.188,0.1471,0.06913,0.2535,0.07993
-8810703,M,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525,2.873,1.476,21.98,525.6,0.01345,0.02772,0.06389,0.01407,0.04783,0.004476,28.11,18.47,188.5,2499,0.1142,0.1516,0.3201,0.1595,0.1648,0.05525
-881094802,M,17.42,25.56,114.5,948,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,0.5296,1.667,3.767,58.53,0.03113,0.08555,0.1438,0.03927,0.02175,0.01256,18.07,28.07,120.4,1021,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818
-8810955,M,14.19,23.81,92.87,610.7,0.09463,0.1306,0.1115,0.06462,0.2235,0.06433,0.4207,1.845,3.534,31,0.01088,0.0371,0.03688,0.01627,0.04499,0.004768,16.86,34.85,115,811.3,0.1559,0.4059,0.3744,0.1772,0.4724,0.1026
-8810987,M,13.86,16.93,90.96,578.9,0.1026,0.1517,0.09901,0.05602,0.2106,0.06916,0.2563,1.194,1.933,22.69,0.00596,0.03438,0.03909,0.01435,0.01939,0.00456,15.75,26.93,104.4,750.1,0.146,0.437,0.4636,0.1654,0.363,0.1059
-8811523,B,11.89,18.35,77.32,432.2,0.09363,0.1154,0.06636,0.03142,0.1967,0.06314,0.2963,1.563,2.087,21.46,0.008872,0.04192,0.05946,0.01785,0.02793,0.004775,13.25,27.1,86.2,531.2,0.1405,0.3046,0.2806,0.1138,0.3397,0.08365
-8811779,B,10.2,17.48,65.05,321.2,0.08054,0.05907,0.05774,0.01071,0.1964,0.06315,0.3567,1.922,2.747,22.79,0.00468,0.0312,0.05774,0.01071,0.0256,0.004613,11.48,24.47,75.4,403.7,0.09527,0.1397,0.1925,0.03571,0.2868,0.07809
-8811842,M,19.8,21.56,129.7,1230,0.09383,0.1306,0.1272,0.08691,0.2094,0.05581,0.9553,1.186,6.487,124.4,0.006804,0.03169,0.03446,0.01712,0.01897,0.004045,25.73,28.64,170.3,2009,0.1353,0.3235,0.3617,0.182,0.307,0.08255
-88119002,M,19.53,32.47,128,1223,0.0842,0.113,0.1145,0.06637,0.1428,0.05313,0.7392,1.321,4.722,109.9,0.005539,0.02644,0.02664,0.01078,0.01332,0.002256,27.9,45.41,180.2,2477,0.1408,0.4097,0.3995,0.1625,0.2713,0.07568
-8812816,B,13.65,13.16,87.88,568.9,0.09646,0.08711,0.03888,0.02563,0.136,0.06344,0.2102,0.4336,1.391,17.4,0.004133,0.01695,0.01652,0.006659,0.01371,0.002735,15.34,16.35,99.71,706.2,0.1311,0.2474,0.1759,0.08056,0.238,0.08718
-8812818,B,13.56,13.9,88.59,561.3,0.1051,0.1192,0.0786,0.04451,0.1962,0.06303,0.2569,0.4981,2.011,21.03,0.005851,0.02314,0.02544,0.00836,0.01842,0.002918,14.98,17.13,101.1,686.6,0.1376,0.2698,0.2577,0.0909,0.3065,0.08177
-8812844,B,10.18,17.53,65.12,313.1,0.1061,0.08502,0.01768,0.01915,0.191,0.06908,0.2467,1.217,1.641,15.05,0.007899,0.014,0.008534,0.007624,0.02637,0.003761,11.17,22.84,71.94,375.6,0.1406,0.144,0.06572,0.05575,0.3055,0.08797
-8812877,M,15.75,20.25,102.6,761.3,0.1025,0.1204,0.1147,0.06462,0.1935,0.06303,0.3473,0.9209,2.244,32.19,0.004766,0.02374,0.02384,0.008637,0.01772,0.003131,19.56,30.29,125.9,1088,0.1552,0.448,0.3976,0.1479,0.3993,0.1064
-8813129,B,13.27,17.02,84.55,546.4,0.08445,0.04994,0.03554,0.02456,0.1496,0.05674,0.2927,0.8907,2.044,24.68,0.006032,0.01104,0.02259,0.009057,0.01482,0.002496,15.14,23.6,98.84,708.8,0.1276,0.1311,0.1786,0.09678,0.2506,0.07623
-88143502,B,14.34,13.47,92.51,641.2,0.09906,0.07624,0.05724,0.04603,0.2075,0.05448,0.522,0.8121,3.763,48.29,0.007089,0.01428,0.0236,0.01286,0.02266,0.001463,16.77,16.9,110.4,873.2,0.1297,0.1525,0.1632,0.1087,0.3062,0.06072
-88147101,B,10.44,15.46,66.62,329.6,0.1053,0.07722,0.006643,0.01216,0.1788,0.0645,0.1913,0.9027,1.208,11.86,0.006513,0.008061,0.002817,0.004972,0.01502,0.002821,11.52,19.8,73.47,395.4,0.1341,0.1153,0.02639,0.04464,0.2615,0.08269
-88147102,B,15,15.51,97.45,684.5,0.08371,0.1096,0.06505,0.0378,0.1881,0.05907,0.2318,0.4966,2.276,19.88,0.004119,0.03207,0.03644,0.01155,0.01391,0.003204,16.41,19.31,114.2,808.2,0.1136,0.3627,0.3402,0.1379,0.2954,0.08362
-88147202,B,12.62,23.97,81.35,496.4,0.07903,0.07529,0.05438,0.02036,0.1514,0.06019,0.2449,1.066,1.445,18.51,0.005169,0.02294,0.03016,0.008691,0.01365,0.003407,14.2,31.31,90.67,624,0.1227,0.3454,0.3911,0.118,0.2826,0.09585
-881861,M,12.83,22.33,85.26,503.2,0.1088,0.1799,0.1695,0.06861,0.2123,0.07254,0.3061,1.069,2.257,25.13,0.006983,0.03858,0.04683,0.01499,0.0168,0.005617,15.2,30.15,105.3,706,0.1777,0.5343,0.6282,0.1977,0.3407,0.1243
-881972,M,17.05,19.08,113.4,895,0.1141,0.1572,0.191,0.109,0.2131,0.06325,0.2959,0.679,2.153,31.98,0.005532,0.02008,0.03055,0.01384,0.01177,0.002336,19.59,24.89,133.5,1189,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061
-88199202,B,11.32,27.08,71.76,395.7,0.06883,0.03813,0.01633,0.003125,0.1869,0.05628,0.121,0.8927,1.059,8.605,0.003653,0.01647,0.01633,0.003125,0.01537,0.002052,12.08,33.75,79.82,452.3,0.09203,0.1432,0.1089,0.02083,0.2849,0.07087
-88203002,B,11.22,33.81,70.79,386.8,0.0778,0.03574,0.004967,0.006434,0.1845,0.05828,0.2239,1.647,1.489,15.46,0.004359,0.006813,0.003223,0.003419,0.01916,0.002534,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
-88206102,M,20.51,27.81,134.4,1319,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,0.524,1.189,3.767,70.01,0.00502,0.02062,0.03457,0.01091,0.01298,0.002887,24.47,37.38,162.7,1872,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328
-882488,B,9.567,15.91,60.21,279.6,0.08464,0.04087,0.01652,0.01667,0.1551,0.06403,0.2152,0.8301,1.215,12.64,0.01164,0.0104,0.01186,0.009623,0.02383,0.00354,10.51,19.16,65.74,335.9,0.1504,0.09515,0.07161,0.07222,0.2757,0.08178
-88249602,B,14.03,21.25,89.79,603.4,0.0907,0.06945,0.01462,0.01896,0.1517,0.05835,0.2589,1.503,1.667,22.07,0.007389,0.01383,0.007302,0.01004,0.01263,0.002925,15.33,30.28,98.27,715.5,0.1287,0.1513,0.06231,0.07963,0.2226,0.07617
-88299702,M,23.21,26.97,153.5,1670,0.09509,0.1682,0.195,0.1237,0.1909,0.06309,1.058,0.9635,7.247,155.8,0.006428,0.02863,0.04497,0.01716,0.0159,0.003053,31.01,34.51,206,2944,0.1481,0.4126,0.582,0.2593,0.3103,0.08677
-883263,M,20.48,21.46,132.5,1306,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,0.6874,1.041,5.144,83.5,0.007959,0.03133,0.04257,0.01671,0.01341,0.003933,24.22,26.17,161.7,1750,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127
-883270,B,14.22,27.85,92.55,623.9,0.08223,0.1039,0.1103,0.04408,0.1342,0.06129,0.3354,2.324,2.105,29.96,0.006307,0.02845,0.0385,0.01011,0.01185,0.003589,15.75,40.54,102.5,764,0.1081,0.2426,0.3064,0.08219,0.189,0.07796
-88330202,M,17.46,39.28,113.4,920.6,0.09812,0.1298,0.1417,0.08811,0.1809,0.05966,0.5366,0.8561,3.002,49,0.00486,0.02785,0.02602,0.01374,0.01226,0.002759,22.51,44.87,141.2,1408,0.1365,0.3735,0.3241,0.2066,0.2853,0.08496
-88350402,B,13.64,15.6,87.38,575.3,0.09423,0.0663,0.04705,0.03731,0.1717,0.0566,0.3242,0.6612,1.996,27.19,0.00647,0.01248,0.0181,0.01103,0.01898,0.001794,14.85,19.05,94.11,683.4,0.1278,0.1291,0.1533,0.09222,0.253,0.0651
-883539,B,12.42,15.04,78.61,476.5,0.07926,0.03393,0.01053,0.01108,0.1546,0.05754,0.1153,0.6745,0.757,9.006,0.003265,0.00493,0.006493,0.003762,0.0172,0.00136,13.2,20.37,83.85,543.4,0.1037,0.07776,0.06243,0.04052,0.2901,0.06783
-883852,B,11.3,18.19,73.93,389.4,0.09592,0.1325,0.1548,0.02854,0.2054,0.07669,0.2428,1.642,2.369,16.39,0.006663,0.05914,0.0888,0.01314,0.01995,0.008675,12.58,27.96,87.16,472.9,0.1347,0.4848,0.7436,0.1218,0.3308,0.1297
-88411702,B,13.75,23.77,88.54,590,0.08043,0.06807,0.04697,0.02344,0.1773,0.05429,0.4347,1.057,2.829,39.93,0.004351,0.02667,0.03371,0.01007,0.02598,0.003087,15.01,26.34,98,706,0.09368,0.1442,0.1359,0.06106,0.2663,0.06321
-884180,M,19.4,23.5,129.1,1155,0.1027,0.1558,0.2049,0.08886,0.1978,0.06,0.5243,1.802,4.037,60.41,0.01061,0.03252,0.03915,0.01559,0.02186,0.003949,21.65,30.53,144.9,1417,0.1463,0.2968,0.3458,0.1564,0.292,0.07614
-884437,B,10.48,19.86,66.72,337.7,0.107,0.05971,0.04831,0.0307,0.1737,0.0644,0.3719,2.612,2.517,23.22,0.01604,0.01386,0.01865,0.01133,0.03476,0.00356,11.48,29.46,73.68,402.8,0.1515,0.1026,0.1181,0.06736,0.2883,0.07748
-884448,B,13.2,17.43,84.13,541.6,0.07215,0.04524,0.04336,0.01105,0.1487,0.05635,0.163,1.601,0.873,13.56,0.006261,0.01569,0.03079,0.005383,0.01962,0.00225,13.94,27.82,88.28,602,0.1101,0.1508,0.2298,0.0497,0.2767,0.07198
-884626,B,12.89,14.11,84.95,512.2,0.0876,0.1346,0.1374,0.0398,0.1596,0.06409,0.2025,0.4402,2.393,16.35,0.005501,0.05592,0.08158,0.0137,0.01266,0.007555,14.39,17.7,105,639.1,0.1254,0.5849,0.7727,0.1561,0.2639,0.1178
-88466802,B,10.65,25.22,68.01,347,0.09657,0.07234,0.02379,0.01615,0.1897,0.06329,0.2497,1.493,1.497,16.64,0.007189,0.01035,0.01081,0.006245,0.02158,0.002619,12.25,35.19,77.98,455.7,0.1499,0.1398,0.1125,0.06136,0.3409,0.08147
-884689,B,11.52,14.93,73.87,406.3,0.1013,0.07808,0.04328,0.02929,0.1883,0.06168,0.2562,1.038,1.686,18.62,0.006662,0.01228,0.02105,0.01006,0.01677,0.002784,12.65,21.19,80.88,491.8,0.1389,0.1582,0.1804,0.09608,0.2664,0.07809
-884948,M,20.94,23.56,138.9,1364,0.1007,0.1606,0.2712,0.131,0.2205,0.05898,1.004,0.8208,6.372,137.9,0.005283,0.03908,0.09518,0.01864,0.02401,0.005002,25.58,27,165.3,2010,0.1211,0.3172,0.6991,0.2105,0.3126,0.07849
-88518501,B,11.5,18.45,73.28,407.4,0.09345,0.05991,0.02638,0.02069,0.1834,0.05934,0.3927,0.8429,2.684,26.99,0.00638,0.01065,0.01245,0.009175,0.02292,0.001461,12.97,22.46,83.12,508.9,0.1183,0.1049,0.08105,0.06544,0.274,0.06487
-885429,M,19.73,19.82,130.7,1206,0.1062,0.1849,0.2417,0.0974,0.1733,0.06697,0.7661,0.78,4.115,92.81,0.008482,0.05057,0.068,0.01971,0.01467,0.007259,25.28,25.59,159.8,1933,0.171,0.5955,0.8489,0.2507,0.2749,0.1297
-8860702,M,17.3,17.08,113,928.2,0.1008,0.1041,0.1266,0.08353,0.1813,0.05613,0.3093,0.8568,2.193,33.63,0.004757,0.01503,0.02332,0.01262,0.01394,0.002362,19.85,25.09,130.9,1222,0.1416,0.2405,0.3378,0.1857,0.3138,0.08113
-886226,M,19.45,19.33,126.5,1169,0.1035,0.1188,0.1379,0.08591,0.1776,0.05647,0.5959,0.6342,3.797,71,0.004649,0.018,0.02749,0.01267,0.01365,0.00255,25.7,24.57,163.1,1972,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895
-886452,M,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,0.1908,0.0613,0.425,0.8098,2.563,35.74,0.006351,0.02679,0.03119,0.01342,0.02062,0.002695,16.39,22.07,108.1,826,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
-88649001,M,19.55,28.77,133.6,1207,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,0.8426,1.199,7.158,106.4,0.006356,0.04765,0.03863,0.01519,0.01936,0.005252,25.05,36.27,178.6,1926,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005
-886776,M,15.32,17.27,103.2,713.3,0.1335,0.2284,0.2448,0.1242,0.2398,0.07596,0.6592,1.059,4.061,59.46,0.01015,0.04588,0.04983,0.02127,0.01884,0.00866,17.73,22.66,119.8,928.8,0.1765,0.4503,0.4429,0.2229,0.3258,0.1191
-887181,M,15.66,23.2,110.2,773.5,0.1109,0.3114,0.3176,0.1377,0.2495,0.08104,1.292,2.454,10.12,138.5,0.01236,0.05995,0.08232,0.03024,0.02337,0.006042,19.85,31.64,143.7,1226,0.1504,0.5172,0.6181,0.2462,0.3277,0.1019
-88725602,M,15.53,33.56,103.7,744.9,0.1063,0.1639,0.1751,0.08399,0.2091,0.0665,0.2419,1.278,1.903,23.02,0.005345,0.02556,0.02889,0.01022,0.009947,0.003359,18.49,49.54,126.3,1035,0.1883,0.5564,0.5703,0.2014,0.3512,0.1204
-887549,M,20.31,27.06,132.9,1288,0.1,0.1088,0.1519,0.09333,0.1814,0.05572,0.3977,1.033,2.587,52.34,0.005043,0.01578,0.02117,0.008185,0.01282,0.001892,24.33,39.16,162.3,1844,0.1522,0.2945,0.3788,0.1697,0.3151,0.07999
-888264,M,17.35,23.06,111,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,0.4007,1.317,2.577,44.41,0.005726,0.01106,0.01246,0.007671,0.01411,0.001578,19.85,31.47,128.2,1218,0.124,0.1486,0.1211,0.08235,0.2452,0.06515
-888570,M,17.29,22.13,114.4,947.8,0.08999,0.1273,0.09697,0.07507,0.2108,0.05464,0.8348,1.633,6.146,90.94,0.006717,0.05981,0.04638,0.02149,0.02747,0.005838,20.39,27.24,137.9,1295,0.1134,0.2867,0.2298,0.1528,0.3067,0.07484
-889403,M,15.61,19.38,100,758.6,0.0784,0.05616,0.04209,0.02847,0.1547,0.05443,0.2298,0.9988,1.534,22.18,0.002826,0.009105,0.01311,0.005174,0.01013,0.001345,17.91,31.67,115.9,988.6,0.1084,0.1807,0.226,0.08568,0.2683,0.06829
-889719,M,17.19,22.07,111.6,928.3,0.09726,0.08995,0.09061,0.06527,0.1867,0.0558,0.4203,0.7383,2.819,45.42,0.004493,0.01206,0.02048,0.009875,0.01144,0.001575,21.58,29.33,140.5,1436,0.1558,0.2567,0.3889,0.1984,0.3216,0.0757
-88995002,M,20.73,31.12,135.7,1419,0.09469,0.1143,0.1367,0.08646,0.1769,0.05674,1.172,1.617,7.749,199.7,0.004551,0.01478,0.02143,0.00928,0.01367,0.002299,32.49,47.16,214,3432,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218
-8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,0.06491,0.4505,1.197,3.43,27.1,0.00747,0.03581,0.03354,0.01365,0.03504,0.003318,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
-8910499,B,13.59,21.84,87.16,561,0.07956,0.08259,0.04072,0.02142,0.1635,0.05859,0.338,1.916,2.591,26.76,0.005436,0.02406,0.03099,0.009919,0.0203,0.003009,14.8,30.04,97.66,661.5,0.1005,0.173,0.1453,0.06189,0.2446,0.07024
-8910506,B,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,0.05769,0.2345,1.219,1.546,18.24,0.005518,0.02178,0.02589,0.00633,0.02593,0.002157,13.9,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062
-8910720,B,10.71,20.39,69.5,344.9,0.1082,0.1289,0.08448,0.02867,0.1668,0.06862,0.3198,1.489,2.23,20.74,0.008902,0.04785,0.07339,0.01745,0.02728,0.00761,11.69,25.21,76.51,410.4,0.1335,0.255,0.2534,0.086,0.2605,0.08701
-8910721,B,14.29,16.82,90.3,632.6,0.06429,0.02675,0.00725,0.00625,0.1508,0.05376,0.1302,0.7198,0.8439,10.77,0.003492,0.00371,0.004826,0.003608,0.01536,0.001381,14.91,20.65,94.44,684.6,0.08567,0.05036,0.03866,0.03333,0.2458,0.0612
-8910748,B,11.29,13.04,72.23,388,0.09834,0.07608,0.03265,0.02755,0.1769,0.0627,0.1904,0.5293,1.164,13.17,0.006472,0.01122,0.01282,0.008849,0.01692,0.002817,12.32,16.18,78.27,457.5,0.1358,0.1507,0.1275,0.0875,0.2733,0.08022
-8910988,M,21.75,20.99,147.3,1491,0.09401,0.1961,0.2195,0.1088,0.1721,0.06194,1.167,1.352,8.867,156.8,0.005687,0.0496,0.06329,0.01561,0.01924,0.004614,28.19,28.18,195.9,2384,0.1272,0.4725,0.5807,0.1841,0.2833,0.08858
-8910996,B,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,0.2684,1.409,1.75,16.39,0.0138,0.01067,0.008347,0.009472,0.01798,0.004261,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175
-8911163,M,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,0.4212,1.433,2.765,45.81,0.005444,0.01169,0.01622,0.008522,0.01419,0.002751,20.92,34.69,135.1,1320,0.1315,0.1806,0.208,0.1136,0.2504,0.07948
-8911164,B,11.89,17.36,76.2,435.6,0.1225,0.0721,0.05929,0.07404,0.2015,0.05875,0.6412,2.293,4.021,48.84,0.01418,0.01489,0.01267,0.0191,0.02678,0.003002,12.4,18.99,79.46,472.4,0.1359,0.08368,0.07153,0.08946,0.222,0.06033
-8911230,B,11.33,14.16,71.79,396.6,0.09379,0.03872,0.001487,0.003333,0.1954,0.05821,0.2375,1.28,1.565,17.09,0.008426,0.008998,0.001487,0.003333,0.02358,0.001627,12.2,18.99,77.37,458,0.1259,0.07348,0.004955,0.01111,0.2758,0.06386
-8911670,M,18.81,19.98,120.9,1102,0.08923,0.05884,0.0802,0.05843,0.155,0.04996,0.3283,0.828,2.363,36.74,0.007571,0.01114,0.02623,0.01463,0.0193,0.001676,19.96,24.3,129,1236,0.1243,0.116,0.221,0.1294,0.2567,0.05737
-8911800,B,13.59,17.84,86.24,572.3,0.07948,0.04052,0.01997,0.01238,0.1573,0.0552,0.258,1.166,1.683,22.22,0.003741,0.005274,0.01065,0.005044,0.01344,0.001126,15.5,26.1,98.91,739.1,0.105,0.07622,0.106,0.05185,0.2335,0.06263
-8911834,B,13.85,15.18,88.99,587.4,0.09516,0.07688,0.04479,0.03711,0.211,0.05853,0.2479,0.9195,1.83,19.41,0.004235,0.01541,0.01457,0.01043,0.01528,0.001593,14.98,21.74,98.37,670,0.1185,0.1724,0.1456,0.09993,0.2955,0.06912
-8912049,M,19.16,26.6,126.2,1138,0.102,0.1453,0.1921,0.09664,0.1902,0.0622,0.6361,1.001,4.321,69.65,0.007392,0.02449,0.03988,0.01293,0.01435,0.003446,23.72,35.9,159.8,1724,0.1782,0.3841,0.5754,0.1872,0.3258,0.0972
-8912055,B,11.74,14.02,74.24,427.3,0.07813,0.0434,0.02245,0.02763,0.2101,0.06113,0.5619,1.268,3.717,37.83,0.008034,0.01442,0.01514,0.01846,0.02921,0.002005,13.31,18.26,84.7,533.7,0.1036,0.085,0.06735,0.0829,0.3101,0.06688
-89122,M,19.4,18.18,127.2,1145,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,0.4709,0.9951,2.903,53.16,0.005654,0.02199,0.03059,0.01499,0.01623,0.001965,23.79,28.65,152.4,1628,0.1518,0.3749,0.4316,0.2252,0.359,0.07787
-8912280,M,16.24,18.77,108.8,805.1,0.1066,0.1802,0.1948,0.09052,0.1876,0.06684,0.2873,0.9173,2.464,28.09,0.004563,0.03481,0.03872,0.01209,0.01388,0.004081,18.55,25.09,126.9,1031,0.1365,0.4706,0.5026,0.1732,0.277,0.1063
-8912284,B,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,0.2913,1.389,2.347,23.29,0.006418,0.03961,0.07927,0.01774,0.01878,0.003696,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127
-8912521,B,12.58,18.4,79.83,489,0.08393,0.04216,0.00186,0.002924,0.1697,0.05855,0.2719,1.35,1.721,22.45,0.006383,0.008008,0.00186,0.002924,0.02571,0.002015,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
-8912909,B,11.94,20.76,77.87,441,0.08605,0.1011,0.06574,0.03791,0.1588,0.06766,0.2742,1.39,3.198,21.91,0.006719,0.05156,0.04387,0.01633,0.01872,0.008015,13.24,27.29,92.2,546.1,0.1116,0.2813,0.2365,0.1155,0.2465,0.09981
-8913,B,12.89,13.12,81.89,515.9,0.06955,0.03729,0.0226,0.01171,0.1337,0.05581,0.1532,0.469,1.115,12.68,0.004731,0.01345,0.01652,0.005905,0.01619,0.002081,13.62,15.54,87.4,577,0.09616,0.1147,0.1186,0.05366,0.2309,0.06915
-8913049,B,11.26,19.96,73.72,394.1,0.0802,0.1181,0.09274,0.05588,0.2595,0.06233,0.4866,1.905,2.877,34.68,0.01574,0.08262,0.08099,0.03487,0.03418,0.006517,11.86,22.33,78.27,437.6,0.1028,0.1843,0.1546,0.09314,0.2955,0.07009
-89143601,B,11.37,18.89,72.17,396,0.08713,0.05008,0.02399,0.02173,0.2013,0.05955,0.2656,1.974,1.954,17.49,0.006538,0.01395,0.01376,0.009924,0.03416,0.002928,12.36,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994
-89143602,B,14.41,19.73,96.03,651,0.08757,0.1676,0.1362,0.06602,0.1714,0.07192,0.8811,1.77,4.36,77.11,0.007762,0.1064,0.0996,0.02771,0.04077,0.02286,15.77,22.13,101.7,767.3,0.09983,0.2472,0.222,0.1021,0.2272,0.08799
-8915,B,14.96,19.1,97.03,687.3,0.08992,0.09823,0.0594,0.04819,0.1879,0.05852,0.2877,0.948,2.171,24.87,0.005332,0.02115,0.01536,0.01187,0.01522,0.002815,16.25,26.19,109.1,809.8,0.1313,0.303,0.1804,0.1489,0.2962,0.08472
-891670,B,12.95,16.02,83.14,513.7,0.1005,0.07943,0.06155,0.0337,0.173,0.0647,0.2094,0.7636,1.231,17.67,0.008725,0.02003,0.02335,0.01132,0.02625,0.004726,13.74,19.93,88.81,585.4,0.1483,0.2068,0.2241,0.1056,0.338,0.09584
-891703,B,11.85,17.46,75.54,432.7,0.08372,0.05642,0.02688,0.0228,0.1875,0.05715,0.207,1.238,1.234,13.88,0.007595,0.015,0.01412,0.008578,0.01792,0.001784,13.06,25.75,84.35,517.8,0.1369,0.1758,0.1316,0.0914,0.3101,0.07007
-891716,B,12.72,13.78,81.78,492.1,0.09667,0.08393,0.01288,0.01924,0.1638,0.061,0.1807,0.6931,1.34,13.38,0.006064,0.0118,0.006564,0.007978,0.01374,0.001392,13.5,17.48,88.54,553.7,0.1298,0.1472,0.05233,0.06343,0.2369,0.06922
-891923,B,13.77,13.27,88.06,582.7,0.09198,0.06221,0.01063,0.01917,0.1592,0.05912,0.2191,0.6946,1.479,17.74,0.004348,0.008153,0.004272,0.006829,0.02154,0.001802,14.67,16.93,94.17,661.1,0.117,0.1072,0.03732,0.05802,0.2823,0.06794
-891936,B,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,0.1449,0.06031,0.1753,1.027,1.267,11.09,0.003478,0.01221,0.01072,0.009393,0.02941,0.003428,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
-892189,M,11.76,18.14,75,431.1,0.09968,0.05914,0.02685,0.03515,0.1619,0.06287,0.645,2.105,4.138,49.11,0.005596,0.01005,0.01272,0.01432,0.01575,0.002758,13.36,23.39,85.1,553.6,0.1137,0.07974,0.0612,0.0716,0.1978,0.06915
-892214,B,14.26,18.17,91.22,633.1,0.06576,0.0522,0.02475,0.01374,0.1635,0.05586,0.23,0.669,1.661,20.56,0.003169,0.01377,0.01079,0.005243,0.01103,0.001957,16.22,25.26,105.8,819.7,0.09445,0.2167,0.1565,0.0753,0.2636,0.07676
-892399,B,10.51,23.09,66.85,334.2,0.1015,0.06797,0.02495,0.01875,0.1695,0.06556,0.2868,1.143,2.289,20.56,0.01017,0.01443,0.01861,0.0125,0.03464,0.001971,10.93,24.22,70.1,362.7,0.1143,0.08614,0.04158,0.03125,0.2227,0.06777
-892438,M,19.53,18.9,129.5,1217,0.115,0.1642,0.2197,0.1062,0.1792,0.06552,1.111,1.161,7.237,133,0.006056,0.03203,0.05638,0.01733,0.01884,0.004787,25.93,26.24,171.1,2053,0.1495,0.4116,0.6121,0.198,0.2968,0.09929
-892604,B,12.46,19.89,80.43,471.3,0.08451,0.1014,0.0683,0.03099,0.1781,0.06249,0.3642,1.04,2.579,28.32,0.00653,0.03369,0.04712,0.01403,0.0274,0.004651,13.46,23.07,88.13,551.3,0.105,0.2158,0.1904,0.07625,0.2685,0.07764
-89263202,M,20.09,23.86,134.7,1247,0.108,0.1838,0.2283,0.128,0.2249,0.07469,1.072,1.743,7.804,130.8,0.007964,0.04732,0.07649,0.01936,0.02736,0.005928,23.68,29.43,158.8,1696,0.1347,0.3391,0.4932,0.1923,0.3294,0.09469
-892657,B,10.49,18.61,66.86,334.3,0.1068,0.06678,0.02297,0.0178,0.1482,0.066,0.1485,1.563,1.035,10.08,0.008875,0.009362,0.01808,0.009199,0.01791,0.003317,11.06,24.54,70.76,375.4,0.1413,0.1044,0.08423,0.06528,0.2213,0.07842
-89296,B,11.46,18.16,73.59,403.1,0.08853,0.07694,0.03344,0.01502,0.1411,0.06243,0.3278,1.059,2.475,22.93,0.006652,0.02652,0.02221,0.007807,0.01894,0.003411,12.68,21.61,82.69,489.8,0.1144,0.1789,0.1226,0.05509,0.2208,0.07638
-893061,B,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,0.1935,0.05878,0.2512,1.786,1.961,18.21,0.006122,0.02337,0.01596,0.006998,0.03194,0.002211,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745
-89344,B,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,0.1903,0.5735,1.204,15.5,0.003632,0.007861,0.001128,0.002386,0.01344,0.002585,14.41,20.45,92,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385
-89346,B,9,14.4,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,0.1746,1.305,1.144,9.789,0.007389,0.004883,0.003681,0.003472,0.02701,0.002153,9.699,20.07,60.9,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804
-893526,B,13.5,12.71,85.69,566.2,0.07376,0.03614,0.002758,0.004419,0.1365,0.05335,0.2244,0.6864,1.509,20.39,0.003338,0.003746,0.00203,0.003242,0.0148,0.001566,14.97,16.94,95.48,698.7,0.09023,0.05836,0.01379,0.0221,0.2267,0.06192
-893548,B,13.05,13.84,82.71,530.6,0.08352,0.03735,0.004559,0.008829,0.1453,0.05518,0.3975,0.8285,2.567,33.01,0.004148,0.004711,0.002831,0.004821,0.01422,0.002273,14.73,17.4,93.96,672.4,0.1016,0.05847,0.01824,0.03532,0.2107,0.0658
-893783,B,11.7,19.11,74.33,418.7,0.08814,0.05253,0.01583,0.01148,0.1936,0.06128,0.1601,1.43,1.109,11.28,0.006064,0.00911,0.01042,0.007638,0.02349,0.001661,12.61,26.55,80.92,483.1,0.1223,0.1087,0.07915,0.05741,0.3487,0.06958
-89382601,B,14.61,15.69,92.68,664.9,0.07618,0.03515,0.01447,0.01877,0.1632,0.05255,0.316,0.9115,1.954,28.9,0.005031,0.006021,0.005325,0.006324,0.01494,0.0008948,16.46,21.75,103.7,840.8,0.1011,0.07087,0.04746,0.05813,0.253,0.05695
-89382602,B,12.76,13.37,82.29,504.1,0.08794,0.07948,0.04052,0.02548,0.1601,0.0614,0.3265,0.6594,2.346,25.18,0.006494,0.02768,0.03137,0.01069,0.01731,0.004392,14.19,16.4,92.04,618.8,0.1194,0.2208,0.1769,0.08411,0.2564,0.08253
-893988,B,11.54,10.72,73.73,409.1,0.08597,0.05969,0.01367,0.008907,0.1833,0.061,0.1312,0.3602,1.107,9.438,0.004124,0.0134,0.01003,0.004667,0.02032,0.001952,12.34,12.87,81.23,467.8,0.1092,0.1626,0.08324,0.04715,0.339,0.07434
-894047,B,8.597,18.6,54.09,221.2,0.1074,0.05847,0,0,0.2163,0.07359,0.3368,2.777,2.222,17.81,0.02075,0.01403,0,0,0.06146,0.00682,8.952,22.44,56.65,240.1,0.1347,0.07767,0,0,0.3142,0.08116
-894089,B,12.49,16.85,79.19,481.6,0.08511,0.03834,0.004473,0.006423,0.1215,0.05673,0.1716,0.7151,1.047,12.69,0.004928,0.003012,0.00262,0.00339,0.01393,0.001344,13.34,19.71,84.48,544.2,0.1104,0.04953,0.01938,0.02784,0.1917,0.06174
-894090,B,12.18,14.08,77.25,461.4,0.07734,0.03212,0.01123,0.005051,0.1673,0.05649,0.2113,0.5996,1.438,15.82,0.005343,0.005767,0.01123,0.005051,0.01977,0.0009502,12.85,16.47,81.6,513.1,0.1001,0.05332,0.04116,0.01852,0.2293,0.06037
-894326,M,18.22,18.87,118.7,1027,0.09746,0.1117,0.113,0.0795,0.1807,0.05664,0.4041,0.5503,2.547,48.9,0.004821,0.01659,0.02408,0.01143,0.01275,0.002451,21.84,25,140.9,1485,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
-894329,B,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,0.4653,1.911,3.769,24.2,0.009845,0.0659,0.1027,0.02527,0.03491,0.007877,10.06,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055
-894335,B,12.43,17,78.6,477.3,0.07557,0.03454,0.01342,0.01699,0.1472,0.05561,0.3778,2.2,2.487,31.16,0.007357,0.01079,0.009959,0.0112,0.03433,0.002961,12.9,20.21,81.76,515.9,0.08409,0.04712,0.02237,0.02832,0.1901,0.05932
-894604,B,10.25,16.18,66.52,324.2,0.1061,0.1111,0.06726,0.03965,0.1743,0.07279,0.3677,1.471,1.597,22.68,0.01049,0.04265,0.04004,0.01544,0.02719,0.007596,11.28,20.61,71.53,390.4,0.1402,0.236,0.1898,0.09744,0.2608,0.09702
-894618,M,20.16,19.66,131.1,1274,0.0802,0.08564,0.1155,0.07726,0.1928,0.05096,0.5925,0.6863,3.868,74.85,0.004536,0.01376,0.02645,0.01247,0.02193,0.001589,23.06,23.03,150.2,1657,0.1054,0.1537,0.2606,0.1425,0.3055,0.05933
-894855,B,12.86,13.32,82.82,504.8,0.1134,0.08834,0.038,0.034,0.1543,0.06476,0.2212,1.042,1.614,16.57,0.00591,0.02016,0.01902,0.01011,0.01202,0.003107,14.04,21.08,92.8,599.5,0.1547,0.2231,0.1791,0.1155,0.2382,0.08553
-895100,M,20.34,21.51,135.9,1264,0.117,0.1875,0.2565,0.1504,0.2569,0.0667,0.5702,1.023,4.012,69.06,0.005485,0.02431,0.0319,0.01369,0.02768,0.003345,25.3,31.86,171.1,1938,0.1592,0.4492,0.5344,0.2685,0.5558,0.1024
-89511501,B,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,0.2575,0.8073,1.959,19.01,0.005403,0.01418,0.01051,0.005142,0.01333,0.002065,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
-89511502,B,12.67,17.3,81.25,489.9,0.1028,0.07664,0.03193,0.02107,0.1707,0.05984,0.21,0.9505,1.566,17.61,0.006809,0.009514,0.01329,0.006474,0.02057,0.001784,13.71,21.1,88.7,574.4,0.1384,0.1212,0.102,0.05602,0.2688,0.06888
-89524,B,14.11,12.88,90.03,616.5,0.09309,0.05306,0.01765,0.02733,0.1373,0.057,0.2571,1.081,1.558,23.92,0.006692,0.01132,0.005717,0.006627,0.01416,0.002476,15.53,18,98.4,749.9,0.1281,0.1109,0.05307,0.0589,0.21,0.07083
-895299,B,12.03,17.93,76.09,446,0.07683,0.03892,0.001546,0.005592,0.1382,0.0607,0.2335,0.9097,1.466,16.97,0.004729,0.006887,0.001184,0.003951,0.01466,0.001755,13.07,22.25,82.74,523.4,0.1013,0.0739,0.007732,0.02796,0.2171,0.07037
-8953902,M,16.27,20.71,106.9,813.7,0.1169,0.1319,0.1478,0.08488,0.1948,0.06277,0.4375,1.232,3.27,44.41,0.006697,0.02083,0.03248,0.01392,0.01536,0.002789,19.28,30.38,129.8,1121,0.159,0.2947,0.3597,0.1583,0.3103,0.082
-895633,M,16.26,21.88,107.5,826.8,0.1165,0.1283,0.1799,0.07981,0.1869,0.06532,0.5706,1.457,2.961,57.72,0.01056,0.03756,0.05839,0.01186,0.04022,0.006187,17.73,25.21,113.7,975.2,0.1426,0.2116,0.3344,0.1047,0.2736,0.07953
-896839,M,16.03,15.51,105.8,793.2,0.09491,0.1371,0.1204,0.07041,0.1782,0.05976,0.3371,0.7476,2.629,33.27,0.005839,0.03245,0.03715,0.01459,0.01467,0.003121,18.76,21.98,124.3,1070,0.1435,0.4478,0.4956,0.1981,0.3019,0.09124
-896864,B,12.98,19.35,84.52,514,0.09579,0.1125,0.07107,0.0295,0.1761,0.0654,0.2684,0.5664,2.465,20.65,0.005727,0.03255,0.04393,0.009811,0.02751,0.004572,14.42,21.95,99.21,634.3,0.1288,0.3253,0.3439,0.09858,0.3596,0.09166
-897132,B,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,0.2976,1.966,1.959,19.62,0.01289,0.01104,0.003297,0.004967,0.04243,0.001963,11.98,25.78,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522
-897137,B,11.25,14.78,71.38,390,0.08306,0.04458,0.0009737,0.002941,0.1773,0.06081,0.2144,0.9961,1.529,15.07,0.005617,0.007124,0.0009737,0.002941,0.017,0.00203,12.76,22.06,82.08,492.7,0.1166,0.09794,0.005518,0.01667,0.2815,0.07418
-897374,B,12.3,19.02,77.88,464.4,0.08313,0.04202,0.007756,0.008535,0.1539,0.05945,0.184,1.532,1.199,13.24,0.007881,0.008432,0.007004,0.006522,0.01939,0.002222,13.35,28.46,84.53,544.3,0.1222,0.09052,0.03619,0.03983,0.2554,0.07207
-89742801,M,17.06,21,111.8,918.6,0.1119,0.1056,0.1508,0.09934,0.1727,0.06071,0.8161,2.129,6.076,87.17,0.006455,0.01797,0.04502,0.01744,0.01829,0.003733,20.99,33.15,143.2,1362,0.1449,0.2053,0.392,0.1827,0.2623,0.07599
-897604,B,12.99,14.23,84.08,514.3,0.09462,0.09965,0.03738,0.02098,0.1652,0.07238,0.1814,0.6412,0.9219,14.41,0.005231,0.02305,0.03113,0.007315,0.01639,0.005701,13.72,16.91,87.38,576,0.1142,0.1975,0.145,0.0585,0.2432,0.1009
-897630,M,18.77,21.43,122.9,1092,0.09116,0.1402,0.106,0.0609,0.1953,0.06083,0.6422,1.53,4.369,88.25,0.007548,0.03897,0.03914,0.01816,0.02168,0.004445,24.54,34.37,161.1,1873,0.1498,0.4827,0.4634,0.2048,0.3679,0.0987
-897880,B,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664
-89812,M,23.51,24.27,155.1,1747,0.1069,0.1283,0.2308,0.141,0.1797,0.05506,1.009,0.9245,6.462,164.1,0.006292,0.01971,0.03582,0.01301,0.01479,0.003118,30.67,30.73,202.4,2906,0.1515,0.2678,0.4819,0.2089,0.2593,0.07738
-89813,B,14.42,16.54,94.15,641.2,0.09751,0.1139,0.08007,0.04223,0.1912,0.06412,0.3491,0.7706,2.677,32.14,0.004577,0.03053,0.0384,0.01243,0.01873,0.003373,16.67,21.51,111.4,862.1,0.1294,0.3371,0.3755,0.1414,0.3053,0.08764
-898143,B,9.606,16.84,61.64,280.5,0.08481,0.09228,0.08422,0.02292,0.2036,0.07125,0.1844,0.9429,1.429,12.07,0.005954,0.03471,0.05028,0.00851,0.0175,0.004031,10.75,23.07,71.25,353.6,0.1233,0.3416,0.4341,0.0812,0.2982,0.09825
-89827,B,11.06,14.96,71.49,373.9,0.1033,0.09097,0.05397,0.03341,0.1776,0.06907,0.1601,0.8225,1.355,10.8,0.007416,0.01877,0.02758,0.0101,0.02348,0.002917,11.92,19.9,79.76,440,0.1418,0.221,0.2299,0.1075,0.3301,0.0908
-898431,M,19.68,21.68,129.9,1194,0.09797,0.1339,0.1863,0.1103,0.2082,0.05715,0.6226,2.284,5.173,67.66,0.004756,0.03368,0.04345,0.01806,0.03756,0.003288,22.75,34.66,157.6,1540,0.1218,0.3458,0.4734,0.2255,0.4045,0.07918
-89864002,B,11.71,15.45,75.03,420.3,0.115,0.07281,0.04006,0.0325,0.2009,0.06506,0.3446,0.7395,2.355,24.53,0.009536,0.01097,0.01651,0.01121,0.01953,0.0031,13.06,18.16,84.16,516.4,0.146,0.1115,0.1087,0.07864,0.2765,0.07806
-898677,B,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,0.1633,0.07005,0.338,2.509,2.394,19.33,0.01736,0.04671,0.02611,0.01296,0.03675,0.006758,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
-898678,B,12.06,18.9,76.66,445.3,0.08386,0.05794,0.00751,0.008488,0.1555,0.06048,0.243,1.152,1.559,18.02,0.00718,0.01096,0.005832,0.005495,0.01982,0.002754,13.64,27.06,86.54,562.6,0.1289,0.1352,0.04506,0.05093,0.288,0.08083
-89869,B,14.76,14.74,94.87,668.7,0.08875,0.0778,0.04608,0.03528,0.1521,0.05912,0.3428,0.3981,2.537,29.06,0.004732,0.01506,0.01855,0.01067,0.02163,0.002783,17.27,17.93,114.2,880.8,0.122,0.2009,0.2151,0.1251,0.3109,0.08187
-898690,B,11.47,16.03,73.02,402.7,0.09076,0.05886,0.02587,0.02322,0.1634,0.06372,0.1707,0.7615,1.09,12.25,0.009191,0.008548,0.0094,0.006315,0.01755,0.003009,12.51,20.79,79.67,475.8,0.1531,0.112,0.09823,0.06548,0.2851,0.08763
-899147,B,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,0.361,1.05,2.455,26.65,0.0058,0.02417,0.007816,0.01052,0.02734,0.003114,12.81,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759
-899187,B,11.66,17.07,73.7,421,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,0.3534,0.6724,2.225,26.03,0.006583,0.006991,0.005949,0.006296,0.02216,0.002668,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
-899667,M,15.75,19.22,107.1,758.6,0.1243,0.2364,0.2914,0.1242,0.2375,0.07603,0.5204,1.324,3.477,51.22,0.009329,0.06559,0.09953,0.02283,0.05543,0.00733,17.36,24.17,119.4,915.3,0.155,0.5046,0.6872,0.2135,0.4245,0.105
-899987,M,25.73,17.46,174.2,2010,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,0.9948,0.8509,7.222,153.1,0.006369,0.04243,0.04266,0.01508,0.02335,0.003385,33.13,23.58,229.3,3234,0.153,0.5937,0.6451,0.2756,0.369,0.08815
-9010018,M,15.08,25.74,98,716.6,0.1024,0.09769,0.1235,0.06553,0.1647,0.06464,0.6534,1.506,4.174,63.37,0.01052,0.02431,0.04912,0.01746,0.0212,0.004867,18.51,33.22,121.2,1050,0.166,0.2356,0.4029,0.1526,0.2654,0.09438
-901011,B,11.14,14.07,71.24,384.6,0.07274,0.06064,0.04505,0.01471,0.169,0.06083,0.4222,0.8092,3.33,28.84,0.005541,0.03387,0.04505,0.01471,0.03102,0.004831,12.12,15.82,79.62,453.5,0.08864,0.1256,0.1201,0.03922,0.2576,0.07018
-9010258,B,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,0.3602,1.478,3.212,27.49,0.009853,0.04235,0.06271,0.01966,0.02639,0.004205,13.37,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188
-9010259,B,13.05,18.59,85.09,512,0.1082,0.1304,0.09603,0.05603,0.2035,0.06501,0.3106,1.51,2.59,21.57,0.007807,0.03932,0.05112,0.01876,0.0286,0.005715,14.19,24.85,94.22,591.2,0.1343,0.2658,0.2573,0.1258,0.3113,0.08317
-901028,B,13.87,16.21,88.52,593.7,0.08743,0.05492,0.01502,0.02088,0.1424,0.05883,0.2543,1.363,1.737,20.74,0.005638,0.007939,0.005254,0.006042,0.01544,0.002087,15.11,25.58,96.74,694.4,0.1153,0.1008,0.05285,0.05556,0.2362,0.07113
-9010333,B,8.878,15.49,56.74,241,0.08293,0.07698,0.04721,0.02381,0.193,0.06621,0.5381,1.2,4.277,30.18,0.01093,0.02899,0.03214,0.01506,0.02837,0.004174,9.981,17.7,65.27,302,0.1015,0.1248,0.09441,0.04762,0.2434,0.07431
-901034301,B,9.436,18.32,59.82,278.6,0.1009,0.05956,0.0271,0.01406,0.1506,0.06959,0.5079,1.247,3.267,30.48,0.006836,0.008982,0.02348,0.006565,0.01942,0.002713,12.02,25.02,75.79,439.6,0.1333,0.1049,0.1144,0.05052,0.2454,0.08136
-901034302,B,12.54,18.07,79.42,491.9,0.07436,0.0265,0.001194,0.005449,0.1528,0.05185,0.3511,0.9527,2.329,28.3,0.005783,0.004693,0.0007929,0.003617,0.02043,0.001058,13.72,20.98,86.82,585.7,0.09293,0.04327,0.003581,0.01635,0.2233,0.05521
-901041,B,13.3,21.57,85.24,546.1,0.08582,0.06373,0.03344,0.02424,0.1815,0.05696,0.2621,1.539,2.028,20.98,0.005498,0.02045,0.01795,0.006399,0.01829,0.001956,14.2,29.2,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658
-9010598,B,12.76,18.84,81.87,496.6,0.09676,0.07952,0.02688,0.01781,0.1759,0.06183,0.2213,1.285,1.535,17.26,0.005608,0.01646,0.01529,0.009997,0.01909,0.002133,13.75,25.99,87.82,579.7,0.1298,0.1839,0.1255,0.08312,0.2744,0.07238
-9010872,B,16.5,18.29,106.6,838.1,0.09686,0.08468,0.05862,0.04835,0.1495,0.05593,0.3389,1.439,2.344,33.58,0.007257,0.01805,0.01832,0.01033,0.01694,0.002001,18.13,25.45,117.2,1009,0.1338,0.1679,0.1663,0.09123,0.2394,0.06469
-9010877,B,13.4,16.95,85.48,552.4,0.07937,0.05696,0.02181,0.01473,0.165,0.05701,0.1584,0.6124,1.036,13.22,0.004394,0.0125,0.01451,0.005484,0.01291,0.002074,14.73,21.7,93.76,663.5,0.1213,0.1676,0.1364,0.06987,0.2741,0.07582
-901088,M,20.44,21.78,133.8,1293,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,0.5781,0.9168,4.218,72.44,0.006208,0.01906,0.02375,0.01461,0.01445,0.001906,24.31,26.37,161.2,1780,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735
-9011494,M,20.2,26.83,133.7,1234,0.09905,0.1669,0.1641,0.1265,0.1875,0.0602,0.9761,1.892,7.128,103.6,0.008439,0.04674,0.05904,0.02536,0.0371,0.004286,24.19,33.81,160,1671,0.1278,0.3416,0.3703,0.2152,0.3271,0.07632
-9011495,B,12.21,18.02,78.31,458.4,0.09231,0.07175,0.04392,0.02027,0.1695,0.05916,0.2527,0.7786,1.874,18.57,0.005833,0.01388,0.02,0.007087,0.01938,0.00196,14.29,24.04,93.85,624.6,0.1368,0.217,0.2413,0.08829,0.3218,0.0747
-9011971,M,21.71,17.25,140.9,1546,0.09384,0.08562,0.1168,0.08465,0.1717,0.05054,1.207,1.051,7.733,224.1,0.005568,0.01112,0.02096,0.01197,0.01263,0.001803,30.75,26.44,199.5,3143,0.1363,0.1628,0.2861,0.182,0.251,0.06494
-9012000,M,22.01,21.9,147.2,1482,0.1063,0.1954,0.2448,0.1501,0.1824,0.0614,1.008,0.6999,7.561,130.2,0.003978,0.02821,0.03576,0.01471,0.01518,0.003796,27.66,25.8,195,2227,0.1294,0.3885,0.4756,0.2432,0.2741,0.08574
-9012315,M,16.35,23.29,109,840.4,0.09742,0.1497,0.1811,0.08773,0.2175,0.06218,0.4312,1.022,2.972,45.5,0.005635,0.03917,0.06072,0.01656,0.03197,0.004085,19.38,31.03,129.3,1165,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614
-9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,0.05544,0.1783,0.4125,1.338,17.72,0.005012,0.01485,0.01551,0.009155,0.01647,0.001767,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
-9012795,M,21.37,15.1,141.3,1386,0.1001,0.1515,0.1932,0.1255,0.1973,0.06183,0.3414,1.309,2.407,39.06,0.004426,0.02675,0.03437,0.01343,0.01675,0.004367,22.69,21.84,152.1,1535,0.1192,0.284,0.4024,0.1966,0.273,0.08666
-901288,M,20.64,17.35,134.8,1335,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,0.6137,0.6575,4.119,77.02,0.006211,0.01895,0.02681,0.01232,0.01276,0.001711,25.37,23.17,166.8,1946,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055
-9013005,B,13.69,16.07,87.84,579.1,0.08302,0.06374,0.02556,0.02031,0.1872,0.05669,0.1705,0.5066,1.372,14,0.00423,0.01587,0.01169,0.006335,0.01943,0.002177,14.84,20.21,99.16,670.6,0.1105,0.2096,0.1346,0.06987,0.3323,0.07701
-901303,B,16.17,16.07,106.3,788.5,0.0988,0.1438,0.06651,0.05397,0.199,0.06572,0.1745,0.489,1.349,14.91,0.00451,0.01812,0.01951,0.01196,0.01934,0.003696,16.97,19.14,113.1,861.5,0.1235,0.255,0.2114,0.1251,0.3153,0.0896
-901315,B,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,0.1115,1.231,2.363,7.228,0.008499,0.07643,0.1535,0.02919,0.01617,0.0122,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12
-9013579,B,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,0.1421,0.05763,0.1689,1.15,1.4,14.91,0.004942,0.01203,0.007508,0.005179,0.01442,0.001684,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061
-9013594,B,13.66,15.15,88.27,580.6,0.08268,0.07548,0.04249,0.02471,0.1792,0.05897,0.1402,0.5417,1.101,11.35,0.005212,0.02984,0.02443,0.008356,0.01818,0.004868,14.54,19.64,97.96,657,0.1275,0.3104,0.2569,0.1054,0.3387,0.09638
-9013838,M,11.08,18.83,73.3,361.6,0.1216,0.2154,0.1689,0.06367,0.2196,0.0795,0.2114,1.027,1.719,13.99,0.007405,0.04549,0.04588,0.01339,0.01738,0.004435,13.24,32.82,91.76,508.1,0.2184,0.9379,0.8402,0.2524,0.4154,0.1403
-901549,B,11.27,12.96,73.16,386.3,0.1237,0.1111,0.079,0.0555,0.2018,0.06914,0.2562,0.9858,1.809,16.04,0.006635,0.01777,0.02101,0.01164,0.02108,0.003721,12.84,20.53,84.93,476.1,0.161,0.2429,0.2247,0.1318,0.3343,0.09215
-901836,B,11.04,14.93,70.67,372.7,0.07987,0.07079,0.03546,0.02074,0.2003,0.06246,0.1642,1.031,1.281,11.68,0.005296,0.01903,0.01723,0.00696,0.0188,0.001941,12.09,20.83,79.73,447.1,0.1095,0.1982,0.1553,0.06754,0.3202,0.07287
-90250,B,12.05,22.72,78.75,447.8,0.06935,0.1073,0.07943,0.02978,0.1203,0.06659,0.1194,1.434,1.778,9.549,0.005042,0.0456,0.04305,0.01667,0.0247,0.007358,12.57,28.71,87.36,488.4,0.08799,0.3214,0.2912,0.1092,0.2191,0.09349
-90251,B,12.39,17.48,80.64,462.9,0.1042,0.1297,0.05892,0.0288,0.1779,0.06588,0.2608,0.873,2.117,19.2,0.006715,0.03705,0.04757,0.01051,0.01838,0.006884,14.18,23.13,95.23,600.5,0.1427,0.3593,0.3206,0.09804,0.2819,0.1118
-902727,B,13.28,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,0.1833,0.5308,1.592,15.26,0.004271,0.02073,0.02828,0.008468,0.01461,0.002613,14.24,17.37,96.59,623.7,0.1166,0.2685,0.2866,0.09173,0.2736,0.0732
-90291,M,14.6,23.29,93.97,664.7,0.08682,0.06636,0.0839,0.05271,0.1627,0.05416,0.4157,1.627,2.914,33.01,0.008312,0.01742,0.03389,0.01576,0.0174,0.002871,15.79,31.71,102.2,758.2,0.1312,0.1581,0.2675,0.1359,0.2477,0.06836
-902975,B,12.21,14.09,78.78,462,0.08108,0.07823,0.06839,0.02534,0.1646,0.06154,0.2666,0.8309,2.097,19.96,0.004405,0.03026,0.04344,0.01087,0.01921,0.004622,13.13,19.29,87.65,529.9,0.1026,0.2431,0.3076,0.0914,0.2677,0.08824
-902976,B,13.88,16.16,88.37,596.6,0.07026,0.04831,0.02045,0.008507,0.1607,0.05474,0.2541,0.6218,1.709,23.12,0.003728,0.01415,0.01988,0.007016,0.01647,0.00197,15.51,19.97,99.66,745.3,0.08484,0.1233,0.1091,0.04537,0.2542,0.06623
-903011,B,11.27,15.5,73.38,392,0.08365,0.1114,0.1007,0.02757,0.181,0.07252,0.3305,1.067,2.569,22.97,0.01038,0.06669,0.09472,0.02047,0.01219,0.01233,12.04,18.93,79.73,450,0.1102,0.2809,0.3021,0.08272,0.2157,0.1043
-90312,M,19.55,23.21,128.9,1174,0.101,0.1318,0.1856,0.1021,0.1989,0.05884,0.6107,2.836,5.383,70.1,0.01124,0.04097,0.07469,0.03441,0.02768,0.00624,20.82,30.44,142,1313,0.1251,0.2414,0.3829,0.1825,0.2576,0.07602
-90317302,B,10.26,12.22,65.75,321.6,0.09996,0.07542,0.01923,0.01968,0.18,0.06569,0.1911,0.5477,1.348,11.88,0.005682,0.01365,0.008496,0.006929,0.01938,0.002371,11.38,15.65,73.23,394.5,0.1343,0.165,0.08615,0.06696,0.2937,0.07722
-903483,B,8.734,16.84,55.27,234.3,0.1039,0.07428,0,0,0.1985,0.07098,0.5169,2.079,3.167,28.85,0.01582,0.01966,0,0,0.01865,0.006736,10.17,22.8,64.01,317,0.146,0.131,0,0,0.2445,0.08865
-903507,M,15.49,19.97,102.4,744.7,0.116,0.1562,0.1891,0.09113,0.1929,0.06744,0.647,1.331,4.675,66.91,0.007269,0.02928,0.04972,0.01639,0.01852,0.004232,21.2,29.41,142.1,1359,0.1681,0.3913,0.5553,0.2121,0.3187,0.1019
-903516,M,21.61,22.28,144.4,1407,0.1167,0.2087,0.281,0.1562,0.2162,0.06606,0.6242,0.9209,4.158,80.99,0.005215,0.03726,0.04718,0.01288,0.02045,0.004028,26.23,28.74,172,2081,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007
-903554,B,12.1,17.72,78.07,446.2,0.1029,0.09758,0.04783,0.03326,0.1937,0.06161,0.2841,1.652,1.869,22.22,0.008146,0.01631,0.01843,0.007513,0.02015,0.001798,13.56,25.8,88.33,559.5,0.1432,0.1773,0.1603,0.06266,0.3049,0.07081
-903811,B,14.06,17.18,89.75,609.1,0.08045,0.05361,0.02681,0.03251,0.1641,0.05764,0.1504,1.685,1.237,12.67,0.005371,0.01273,0.01132,0.009155,0.01719,0.001444,14.92,25.34,96.42,684.5,0.1066,0.1231,0.0846,0.07911,0.2523,0.06609
-90401601,B,13.51,18.89,88.1,558.1,0.1059,0.1147,0.0858,0.05381,0.1806,0.06079,0.2136,1.332,1.513,19.29,0.005442,0.01957,0.03304,0.01367,0.01315,0.002464,14.8,27.2,97.33,675.2,0.1428,0.257,0.3438,0.1453,0.2666,0.07686
-90401602,B,12.8,17.46,83.05,508.3,0.08044,0.08895,0.0739,0.04083,0.1574,0.0575,0.3639,1.265,2.668,30.57,0.005421,0.03477,0.04545,0.01384,0.01869,0.004067,13.74,21.06,90.72,591,0.09534,0.1812,0.1901,0.08296,0.1988,0.07053
-904302,B,11.06,14.83,70.31,378.2,0.07741,0.04768,0.02712,0.007246,0.1535,0.06214,0.1855,0.6881,1.263,12.98,0.004259,0.01469,0.0194,0.004168,0.01191,0.003537,12.68,20.35,80.79,496.7,0.112,0.1879,0.2079,0.05556,0.259,0.09158
-904357,B,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,0.1847,0.06019,0.3438,1.14,2.225,25.06,0.005463,0.01964,0.02079,0.005398,0.01477,0.003071,13.45,24.49,86,562,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121
-90439701,M,17.91,21.02,124.4,994,0.123,0.2576,0.3189,0.1198,0.2113,0.07115,0.403,0.7747,3.123,41.51,0.007159,0.03718,0.06165,0.01051,0.01591,0.005099,20.8,27.78,149.6,1304,0.1873,0.5917,0.9034,0.1964,0.3245,0.1198
-904647,B,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,0.05541,0.2522,1.045,1.649,18.95,0.006175,0.01204,0.01376,0.005832,0.01096,0.001857,13.8,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262
-904689,B,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,0.1874,0.05899,0.2357,1.299,2.397,20.21,0.003629,0.03713,0.03452,0.01065,0.02632,0.003705,14.13,24.61,96.31,621.9,0.09329,0.2318,0.1604,0.06608,0.3207,0.07247
-9047,B,12.94,16.17,83.18,507.6,0.09879,0.08836,0.03296,0.0239,0.1735,0.062,0.1458,0.905,0.9975,11.36,0.002887,0.01285,0.01613,0.007308,0.0187,0.001972,13.86,23.02,89.69,580.9,0.1172,0.1958,0.181,0.08388,0.3297,0.07834
-904969,B,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708,0.3833,0.9078,2.602,30.15,0.007702,0.008491,0.01307,0.0103,0.0297,0.001432,13.18,16.85,84.11,533.1,0.1048,0.06744,0.04921,0.04793,0.2298,0.05974
-904971,B,10.94,18.59,70.39,370,0.1004,0.0746,0.04944,0.02932,0.1486,0.06615,0.3796,1.743,3.018,25.78,0.009519,0.02134,0.0199,0.01155,0.02079,0.002701,12.4,25.58,82.76,472.4,0.1363,0.1644,0.1412,0.07887,0.2251,0.07732
-905189,B,16.14,14.86,104.3,800,0.0949

<TRUNCATED>
r***@apache.org
2018-06-28 14:55:05 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country.txt b/community/mahout-mr/mr-examples/bin/resources/country.txt
new file mode 100644
index 0000000..6a22091
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country.txt
@@ -0,0 +1,229 @@
+Afghanistan
+Albania
+Algeria
+American Samoa
+Andorra
+Angola
+Anguilla
+Antigua and Barbuda
+Argentina
+Armenia
+Aruba
+Australia
+Austria
+Azerbaijan
+Bahamas
+Bangladesh
+Barbados
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bolivia
+Bosnia and Herzegovina
+Botswana
+Bouvet Island
+Brazil
+British Indian Ocean Territory
+Brunei Darussalam
+Bulgaria
+Burkina Faso
+Burundi
+Cambodia
+Cameroon
+Canada
+Cape Verde
+Cayman Islands
+Central African Republic
+Chad
+Chile
+China
+Christmas Island
+Cocos Islands
+Colombia
+Comoros
+Congo
+Cook Islands
+Costa Rica
+Croatia
+Côte d'Ivoire
+Cuba
+Cyprus
+Czech Republic
+Djibouti
+Dominica
+Dominican Republic
+Ecuador
+Egypt
+El Salvador
+Equatorial Guinea
+Eritrea
+Estonia
+Ethiopia
+Falkland Islands
+Faroe Islands
+Fiji
+Finland
+France
+French Guiana
+French Polynesia
+French Southern Territories
+Gabon
+Georgia
+Germany
+Ghana
+Gibraltar
+Greece
+Greenland
+Grenada
+Guadeloupe
+Guam
+Guatemala
+Guernsey
+Guinea
+Guinea-Bissau
+Guyana
+Haiti
+Honduras
+Hong Kong
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Isle of Man
+Israel
+Italy
+Japan
+Jersey
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kuwait
+Kyrgyzstan
+Latvia
+Lebanon
+Lesotho
+Liberia
+Liechtenstein
+Lithuania
+Luxembourg
+Macedonia
+Madagascar
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Marshall Islands
+Martinique
+Mauritania
+Mauritius
+Mayotte
+Mexico
+Micronesia
+Moldova
+Monaco
+Mongolia
+Montenegro
+Montserrat
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nauru
+Nepal
+Netherlands
+Netherlands Antilles
+New Caledonia
+New Zealand
+Nicaragua
+Niger
+Nigeria
+Niue
+Norfolk Island
+Northern Mariana Islands
+Norway
+Oman
+Pakistan
+Palau
+Palestinian Territory
+Panama
+Papua New Guinea
+Paraguay
+Peru
+Philippines
+Pitcairn
+Poland
+Portugal
+Puerto Rico
+Qatar
+Réunion
+Russian Federation
+Rwanda
+Saint Barthélemy
+Saint Helena
+Saint Kitts and Nevis
+Saint Lucia
+Saint Martin
+Saint Pierre and Miquelon
+Saint Vincent and the Grenadines
+Samoa
+San Marino
+Sao Tome and Principe
+Saudi Arabia
+Senegal
+Serbia
+Seychelles
+Sierra Leone
+Singapore
+Slovakia
+Slovenia
+Solomon Islands
+Somalia
+South Africa
+South Georgia and the South Sandwich Islands
+Spain
+Sri Lanka
+Sudan
+Suriname
+Svalbard and Jan Mayen
+Swaziland
+Sweden
+Switzerland
+Syrian Arab Republic
+Taiwan
+Tanzania
+Thailand
+Timor-Leste
+Togo
+Tokelau
+Tonga
+Trinidad and Tobago
+Tunisia
+Turkey
+Turkmenistan
+Turks and Caicos Islands
+Tuvalu
+Ukraine
+United Arab Emirates
+United Kingdom
+United States
+United States Minor Outlying Islands
+Uruguay
+Uzbekistan
+Vanuatu
+Vatican
+Venezuela
+Vietnam
+Virgin Islands
+Wallis and Futuna
+Yemen
+Zambia
+Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country10.txt b/community/mahout-mr/mr-examples/bin/resources/country10.txt
new file mode 100644
index 0000000..97a63e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country10.txt
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/country2.txt b/community/mahout-mr/mr-examples/bin/resources/country2.txt
new file mode 100644
index 0000000..f4b4f61
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/country2.txt
@@ -0,0 +1,2 @@
+United States
+United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/donut-test.csv b/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
new file mode 100644
index 0000000..46ea564
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/donut-test.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","xx","xy","yy","c","a","b"
+0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
+0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
+0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
+0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
+0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
+0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
+0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
+0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
+0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
+0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
+0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
+0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
+0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
+0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
+0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
+0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
+0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
+0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
+0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
+0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
+0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
+0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
+0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
+0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
+0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
+0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
+0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
+0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
+0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
+0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
+0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
+0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
+0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
+0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
+0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
+0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
+0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
+0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
+0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
+0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/donut.csv b/community/mahout-mr/mr-examples/bin/resources/donut.csv
new file mode 100644
index 0000000..33ba3b7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/donut.csv
@@ -0,0 +1,41 @@
+"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
+0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
+0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
+0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
+0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
+0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
+0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
+0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
+0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
+0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
+0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
+0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
+0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
+0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
+0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
+0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
+0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
+0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
+0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
+0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
+0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
+0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
+0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
+0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
+0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
+0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
+0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
+0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
+0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
+0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
+0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
+0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
+0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
+0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
+0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
+0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
+0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
+0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
+0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
+0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
+0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/resources/test-data.csv b/community/mahout-mr/mr-examples/bin/resources/test-data.csv
new file mode 100644
index 0000000..ab683cd
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/resources/test-data.csv
@@ -0,0 +1,61 @@
+"V1","V2","V3","V4","V5","V6","V7","V8","y"
+1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
+1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
+1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
+1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
+1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
+1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
+1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
+1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
+1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
+1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
+1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
+1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
+1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
+1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
+1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
+1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
+1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
+1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
+1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
+1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
+1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
+1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
+1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
+1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
+1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
+1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
+1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
+1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
+1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
+1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
+1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
+1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
+1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
+1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
+1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
+1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
+1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
+1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
+1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
+1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
+1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
+1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
+1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
+1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
+1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
+1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
+1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
+1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
+1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
+1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
+1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
+1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
+1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
+1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
+1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
+1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
+1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
+1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
+1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
+1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh b/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
new file mode 100755
index 0000000..0ee5fe1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/bin/set-dfs-commands.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+# Requires $HADOOP_HOME to be set.
+#
+# Figures out the major version of Hadoop we're using and sets commands
+# for dfs commands
+#
+# Run by each example script.
+
+# Find a hadoop shell. Skipped when MAHOUT_LOCAL is set (local, non-HDFS run).
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="${HADOOP_HOME}/bin/hadoop"
+  # Quoted: HADOOP_HOME may contain spaces.
+  if [ ! -e "$HADOOP" ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+
+# The version probe below needs HADOOP_HOME regardless of MAHOUT_LOCAL;
+# fail with a clear message instead of a cryptic "command not found".
+if [ -z "$HADOOP_HOME" ]; then
+  echo "HADOOP_HOME is not set, can't determine Hadoop version."
+  exit 1
+fi
+
+# Check Hadoop version: extract the major version number from `hadoop version`.
+# grep -E replaces the deprecated egrep; version dots are escaped in the regex.
+v=$("${HADOOP_HOME}/bin/hadoop" version | grep -E "Hadoop [0-9]+\.[0-9]+\.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.')
+
+# String comparison: $v may be empty if the probe failed, and an arithmetic
+# test (-eq) on an empty value errors out instead of reaching the else branch.
+# The obsolescent `-o` connective is replaced by two tests joined with ||.
+if [ "$v" = "1" ] || [ "$v" = "0" ]
+then
+  echo "Discovered Hadoop v0 or v1."
+  export DFS="${HADOOP_HOME}/bin/hadoop dfs"
+  export DFSRM="$DFS -rmr -skipTrash"
+elif [ "$v" = "2" ]
+then
+  echo "Discovered Hadoop v2."
+  export DFS="${HADOOP_HOME}/bin/hdfs dfs"
+  export DFSRM="$DFS -rm -r -skipTrash"
+else
+  echo "Can't determine Hadoop version."
+  exit 1
+fi
+echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
+
+export HVERSION="$v"

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/pom.xml b/community/mahout-mr/mr-examples/pom.xml
new file mode 100644
index 0000000..7627f23
--- /dev/null
+++ b/community/mahout-mr/mr-examples/pom.xml
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-mr</artifactId>
+ <version>0.14.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>mr-examples</artifactId>
+ <name>-- Mahout Classic: Examples</name>
+ <description>Scalable machine learning library examples</description>
+
+ <packaging>jar</packaging>
+ <properties>
+ <mahout.skip.example>false</mahout.skip.example>
+ </properties>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-dependencies</goal>
+ </goals>
+ <configuration>
+ <!-- configure the plugin here -->
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- create examples hadoop job jar -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>job</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <skipAssembly>${mahout.skip.example}</skipAssembly>
+ <descriptors>
+ <descriptor>src/main/assembly/job.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-remote-resources-plugin</artifactId>
+ <configuration>
+ <appendedResourcesDirectory>../mr/src/main/appended-resources</appendedResourcesDirectory>
+ <resourceBundles>
+ <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
+ </resourceBundles>
+ <supplementalModels>
+ <supplementalModel>supplemental-models.xml</supplementalModel>
+ </supplementalModels>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>maven-jetty-plugin</artifactId>
+ <version>6.1.26</version>
+ </plugin>
+ </plugins>
+
+ </build>
+
+ <dependencies>
+
+
+
+
+ </dependencies>
+
+ <profiles>
+ <profile>
+ <id>release.prepare</id>
+ <properties>
+ <mahout.skip.example>true</mahout.skip.example>
+ </properties>
+ </profile>
+ </profiles>
+</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/assembly/job.xml b/community/mahout-mr/mr-examples/src/main/assembly/job.xml
new file mode 100644
index 0000000..0c41f3d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/assembly/job.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<assembly
+ xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
+ http://maven.apache.org/xsd/assembly-1.1.0.xsd">
+ <id>job</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <unpack>true</unpack>
+ <unpackOptions>
+ <!-- MAHOUT-1126 -->
+ <excludes>
+ <exclude>META-INF/LICENSE</exclude>
+ </excludes>
+ </unpackOptions>
+ <scope>runtime</scope>
+ <outputDirectory>/</outputDirectory>
+ <useTransitiveFiltering>true</useTransitiveFiltering>
+ <excludes>
+ <exclude>org.apache.hadoop:hadoop-core</exclude>
+ </excludes>
+ </dependencySet>
+ </dependencySets>
+</assembly>
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
new file mode 100644
index 0000000..6392b9f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example;
+
+import java.io.File;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.CommandLineUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
+
+/**
+ * This class provides a common implementation for parsing input parameters for
+ * all taste examples. Currently they only need the path to the recommendations
+ * file as input.
+ *
+ * The class is safe to be used in threaded contexts.
+ */
+public final class TasteOptionParser {
+
+ // Utility class: only static members, so instantiation is disallowed.
+ private TasteOptionParser() {
+ }
+
+ /**
+ * Parse the given command line arguments.
+ * @param args the arguments as given to the application.
+ * @return the input file if a file was given on the command line, null otherwise
+ * (also null when --help was requested, after printing usage).
+ * @throws OptionException if {@code args} cannot be parsed against the declared options.
+ */
+ public static File getRatings(String[] args) throws OptionException {
+ DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
+ ArgumentBuilder abuilder = new ArgumentBuilder();
+ GroupBuilder gbuilder = new GroupBuilder();
+
+ // Optional --input (-i) flag taking exactly one value (min 1, max 1).
+ Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i")
+ .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+ .withDescription("The Path for input data directory.").create();
+
+ // Standard help option shared via DefaultOptionCreator.
+ Option helpOpt = DefaultOptionCreator.helpOption();
+
+ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create();
+
+ Parser parser = new Parser();
+ parser.setGroup(group);
+ CommandLine cmdLine = parser.parse(args);
+
+ // If help was requested, print usage and return null.
+ if (cmdLine.hasOption(helpOpt)) {
+ CommandLineUtil.printHelp(group);
+ return null;
+ }
+
+ // Wrap the --input value in a File when present; otherwise return null.
+ return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
new file mode 100644
index 0000000..c908e5b
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
+import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * A simple {@link Recommender} implemented for the Book Crossing demo.
+ * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
+ */
+public final class BookCrossingBooleanRecommender implements Recommender {
+
+ private final Recommender recommender;
+
+ public BookCrossingBooleanRecommender(DataModel bcModel) throws TasteException {
+ UserSimilarity similarity = new CachingUserSimilarity(new LogLikelihoodSimilarity(bcModel), bcModel);
+ UserNeighborhood neighborhood =
+ new NearestNUserNeighborhood(10, Double.NEGATIVE_INFINITY, similarity, bcModel, 1.0);
+ recommender = new GenericBooleanPrefUserBasedRecommender(bcModel, neighborhood, similarity);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return recommender.recommend(userID, howMany);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+ return recommend(userID, howMany, null, includeKnownItems);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return recommender.estimatePreference(userID, itemID);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ recommender.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ recommender.removePreference(userID, itemID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return recommender.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ recommender.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "BookCrossingBooleanRecommender[recommender:" + recommender + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
new file mode 100644
index 0000000..2219bce
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+final class BookCrossingBooleanRecommenderBuilder implements RecommenderBuilder {
+
+ @Override
+ public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+ return new BookCrossingBooleanRecommender(dataModel);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..b9814c7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.IRStatistics;
+import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+public final class BookCrossingBooleanRecommenderEvaluatorRunner {
+
+ private static final Logger log = LoggerFactory.getLogger(BookCrossingBooleanRecommenderEvaluatorRunner.class);
+
+ private BookCrossingBooleanRecommenderEvaluatorRunner() {
+ // do nothing
+ }
+
+ public static void main(String... args) throws IOException, TasteException, OptionException {
+ RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator();
+ File ratingsFile = TasteOptionParser.getRatings(args);
+ DataModel model =
+ ratingsFile == null ? new BookCrossingDataModel(true) : new BookCrossingDataModel(ratingsFile, true);
+
+ IRStatistics evaluation = evaluator.evaluate(
+ new BookCrossingBooleanRecommenderBuilder(),
+ new BookCrossingDataModelBuilder(),
+ model,
+ null,
+ 3,
+ Double.NEGATIVE_INFINITY,
+ 1.0);
+
+ log.info(String.valueOf(evaluation));
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
new file mode 100644
index 0000000..3e2f8b5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Closeables;
+import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;
+import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
+import org.apache.mahout.common.iterator.FileLineIterable;
+
+/**
+ * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
+ * data needed by this class. The BX-Book-Ratings.csv file is needed.
+ */
+public final class BookCrossingDataModel extends FileDataModel {
+
+ private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]");
+
+ public BookCrossingDataModel(boolean ignoreRatings) throws IOException {
+ this(GroupLensDataModel.readResourceToTempFile(
+ "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
+ ignoreRatings);
+ }
+
+ /**
+ * @param ratingsFile BookCrossing ratings file in its native format
+ * @throws IOException if an error occurs while reading or writing files
+ */
+ public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException {
+ super(convertBCFile(ratingsFile, ignoreRatings));
+ }
+
+ private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException {
+ if (!originalFile.exists()) {
+ throw new FileNotFoundException(originalFile.toString());
+ }
+ File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt");
+ resultFile.delete();
+ Writer writer = null;
+ try {
+ writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8);
+ for (String line : new FileLineIterable(originalFile, true)) {
+ // 0 ratings are basically "no rating", ignore them (thanks h.9000)
+ if (line.endsWith("\"0\"")) {
+ continue;
+ }
+ // Delete replace anything that isn't numeric, or a semicolon delimiter. Make comma the delimiter.
+ String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
+ .replaceAll("").replace(';', ',');
+ // If this means we deleted an entire ID -- few cases like that -- skip the line
+ if (convertedLine.contains(",,")) {
+ continue;
+ }
+ if (ignoreRatings) {
+ // drop rating
+ convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(','));
+ }
+ writer.write(convertedLine);
+ writer.write('\n');
+ }
+ writer.flush();
+ } catch (IOException ioe) {
+ resultFile.delete();
+ throw ioe;
+ } finally {
+ Closeables.close(writer, false);
+ }
+ return resultFile;
+ }
+
+ @Override
+ public String toString() {
+ return "BookCrossingDataModel";
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
new file mode 100644
index 0000000..9ec2eaf
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.eval.DataModelBuilder;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+
+final class BookCrossingDataModelBuilder implements DataModelBuilder {
+
+ @Override
+ public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
+ return new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(trainingData));
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
new file mode 100644
index 0000000..c06ca2f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
+import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
+import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
+import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
+import org.apache.mahout.cf.taste.recommender.IDRescorer;
+import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * A simple {@link Recommender} implemented for the Book Crossing demo.
+ * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
+ */
+public final class BookCrossingRecommender implements Recommender {
+
+ private final Recommender recommender;
+
+ public BookCrossingRecommender(DataModel bcModel) throws TasteException {
+ UserSimilarity similarity = new CachingUserSimilarity(new EuclideanDistanceSimilarity(bcModel), bcModel);
+ UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, 0.2, similarity, bcModel, 0.2);
+ recommender = new GenericUserBasedRecommender(bcModel, neighborhood, similarity);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
+ return recommender.recommend(userID, howMany);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
+ return recommend(userID, howMany, null, includeKnownItems);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
+ throws TasteException {
+ return recommender.recommend(userID, howMany, rescorer, false);
+ }
+
+ @Override
+ public float estimatePreference(long userID, long itemID) throws TasteException {
+ return recommender.estimatePreference(userID, itemID);
+ }
+
+ @Override
+ public void setPreference(long userID, long itemID, float value) throws TasteException {
+ recommender.setPreference(userID, itemID, value);
+ }
+
+ @Override
+ public void removePreference(long userID, long itemID) throws TasteException {
+ recommender.removePreference(userID, itemID);
+ }
+
+ @Override
+ public DataModel getDataModel() {
+ return recommender.getDataModel();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ recommender.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public String toString() {
+ return "BookCrossingRecommender[recommender:" + recommender + ']';
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
new file mode 100644
index 0000000..bb6d3e1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.recommender.Recommender;
+
+final class BookCrossingRecommenderBuilder implements RecommenderBuilder {
+
+ @Override
+ public Recommender buildRecommender(DataModel dataModel) throws TasteException {
+ return new BookCrossingRecommender(dataModel);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
new file mode 100644
index 0000000..97074d2
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.bookcrossing;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.cli2.OptionException;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
+import org.apache.mahout.cf.taste.example.TasteOptionParser;
+import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class BookCrossingRecommenderEvaluatorRunner {
+
+ private static final Logger log = LoggerFactory.getLogger(BookCrossingRecommenderEvaluatorRunner.class);
+
+ private BookCrossingRecommenderEvaluatorRunner() {
+ // do nothing
+ }
+
+ public static void main(String... args) throws IOException, TasteException, OptionException {
+ RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
+ File ratingsFile = TasteOptionParser.getRatings(args);
+ DataModel model =
+ ratingsFile == null ? new BookCrossingDataModel(false) : new BookCrossingDataModel(ratingsFile, false);
+
+ double evaluation = evaluator.evaluate(new BookCrossingRecommenderBuilder(),
+ null,
+ model,
+ 0.9,
+ 0.3);
+ log.info(String.valueOf(evaluation));
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
new file mode 100644
index 0000000..9244fe3
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
@@ -0,0 +1,9 @@
+Code works with BookCrossing data set, which is not included in this distribution but is downloadable from
+http://www.informatik.uni-freiburg.de/~cziegler/BX/
+
+Data set originated from:
+
+Improving Recommendation Lists Through Topic Diversification,
+ Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen;
+ Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan.
+ To appear.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
new file mode 100644
index 0000000..033daa2
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.map.OpenObjectIntHashMap;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+public final class EmailUtility {
+
+ public static final String SEPARATOR = "separator";
+ public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
+ public static final String FROM_PREFIX = "fromPrefix";
+ public static final String MSG_ID_DIMENSION = "msgIdDim";
+ public static final String FROM_INDEX = "fromIdx";
+ public static final String REFS_INDEX = "refsIdx";
+ private static final String[] EMPTY = new String[0];
+ private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
+ private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
+ private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
+ public static final Pattern WHITESPACE = Pattern.compile("\\s*");
+
+ private EmailUtility() {
+ }
+
+ /**
+ * Strip off some spurious characters that make it harder to dedup
+ */
+ public static String cleanUpEmailAddress(CharSequence address) {
+ //do some cleanup to normalize some things, like: Key: karthik ananth <***@gmail.com>: Value: 178
+ //Key: karthik ananth [mailto:***@gmail.com]=20: Value: 179
+ //TODO: is there more to clean up here?
+ return ADDRESS_CLEANUP.matcher(address).replaceAll("");
+ }
+
+ public static void loadDictionaries(Configuration conf, String fromPrefix,
+ OpenObjectIntHashMap<String> fromDictionary,
+ String msgIdPrefix,
+ OpenObjectIntHashMap<String> msgIdDictionary) throws IOException {
+
+ Path[] localFiles = HadoopUtil.getCachedFiles(conf);
+ FileSystem fs = FileSystem.getLocal(conf);
+ for (Path dictionaryFile : localFiles) {
+
+ // key is word value is id
+
+ OpenObjectIntHashMap<String> dictionary = null;
+ if (dictionaryFile.getName().startsWith(fromPrefix)) {
+ dictionary = fromDictionary;
+ } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) {
+ dictionary = msgIdDictionary;
+ }
+ if (dictionary != null) {
+ dictionaryFile = fs.makeQualified(dictionaryFile);
+ for (Pair<Writable, IntWritable> record
+ : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
+ dictionary.put(record.getFirst().toString(), record.getSecond().get());
+ }
+ }
+ }
+
+ }
+
+ public static String[] parseReferences(CharSequence rawRefs) {
+ String[] splits;
+ if (rawRefs != null && rawRefs.length() > 0) {
+ splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs);
+ for (int i = 0; i < splits.length; i++) {
+ splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll("");
+ }
+ } else {
+ splits = EMPTY;
+ }
+ return splits;
+ }
+
+ public enum Counters {
+ NO_MESSAGE_ID, NO_FROM_ADDRESS
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
new file mode 100644
index 0000000..5cd308d
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
+ */
+public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+
+ private String separator;
+
+ @Override
+ protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
+ separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
+ }
+
+ @Override
+ protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+ //From is in the value
+ String valStr = value.toString();
+ int idx = valStr.indexOf(separator);
+ if (idx == -1) {
+ context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+ } else {
+ String full = valStr.substring(0, idx);
+ //do some cleanup to normalize some things, like: Key: karthik ananth <***@gmail.com>: Value: 178
+ //Key: karthik ananth [mailto:***@gmail.com]=20: Value: 179
+ //TODO: is there more to clean up here?
+ full = EmailUtility.cleanUpEmailAddress(full);
+
+ if (EmailUtility.WHITESPACE.matcher(full).matches()) {
+ context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+ } else {
+ context.write(new Text(full), new VarIntWritable(1));
+ }
+ }
+
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
new file mode 100644
index 0000000..72fcde9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.example.email;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.math.VarIntWritable;
+
+import java.io.IOException;
+
+/**
+ * Key: the string id
+ * Value: the count
+ * Out Key: the string id
+ * Out Value: the sum of the counts
+ */
+public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> {
+
+ @Override
+ protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
+ throws IOException, InterruptedException {
+ int sum = 0;
+ for (VarIntWritable value : values) {
+ sum += value.get();
+ }
+ context.write(new Text(key), new VarIntWritable(sum));
+ }
+}
r***@apache.org
2018-06-28 14:55:11 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
deleted file mode 100644
index bd1149b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/IOUtils.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import com.google.common.base.Function;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-
/**
 * Static helpers that lazily adapt iterables of sequence-file writables
 * (CentroidWritable / ClusterWritable / VectorWritable) into iterables of the
 * corresponding math types. All conversions are lazy views built with
 * Iterables.transform; nothing is materialized up front.
 */
public class IOUtils {

  private IOUtils() {}

  /**
   * Converts CentroidWritable values in a sequence file into Centroids lazily.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Centroid> with the converted vectors.
   */
  public static Iterable<Centroid> getCentroidsFromCentroidWritableIterable(
      Iterable<CentroidWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<CentroidWritable, Centroid>() {
      @Override
      public Centroid apply(CentroidWritable input) {
        Preconditions.checkNotNull(input);
        // Clone so callers can mutate the result without aliasing the writable's state.
        return input.getCentroid().clone();
      }
    });
  }

  /**
   * Converts CentroidWritable values in a sequence file into Centroids lazily.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Centroid> with the converted vectors.
   */
  public static Iterable<Centroid> getCentroidsFromClusterWritableIterable(Iterable<ClusterWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<ClusterWritable, Centroid>() {
      // NOTE(review): this counter lives on the single Function instance, so centroid
      // indices keep incrementing across repeated iterations of the returned Iterable;
      // re-iterating does not restart numbering at 0.
      int numClusters = 0;
      @Override
      public Centroid apply(ClusterWritable input) {
        Preconditions.checkNotNull(input);
        return new Centroid(numClusters++, input.getValue().getCenter().clone(),
            input.getValue().getTotalObservations());
      }
    });
  }

  /**
   * Converts VectorWritable values in a sequence file into Vectors lazily.
   * @param dirIterable the source iterable (comes from a SequenceFileDirIterable).
   * @return an Iterable<Vector> with the converted vectors.
   */
  public static Iterable<Vector> getVectorsFromVectorWritableIterable(Iterable<VectorWritable> dirIterable) {
    return Iterables.transform(dirIterable, new Function<VectorWritable, Vector>() {
      @Override
      public Vector apply(VectorWritable input) {
        Preconditions.checkNotNull(input);
        // Clone for the same aliasing reason as above.
        return input.get().clone();
      }
    });
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
deleted file mode 100644
index 083cd8c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.canopy;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
/**
 * Driver that runs canopy clustering end-to-end on the UCI synthetic control
 * dataset: converts the text input to vectors, runs CanopyDriver, then dumps
 * the resulting clusters to the console.
 */
@Deprecated
public final class Job extends AbstractJob {

  // Subdirectory (under the output path) that receives the vectorized input.
  private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";

  private Job() {
  }

  private static final Logger log = LoggerFactory.getLogger(Job.class);

  /**
   * Entry point. With arguments, delegates to {@link #run(String[])} via
   * ToolRunner; with none, deletes any previous "output" directory and runs on
   * "testdata" with a Euclidean measure and thresholds t1=80, t2=55.
   */
  public static void main(String[] args) throws Exception {
    if (args.length > 0) {
      log.info("Running with only user-supplied arguments");
      ToolRunner.run(new Configuration(), new Job(), args);
    } else {
      log.info("Running with default arguments");
      Path output = new Path("output");
      HadoopUtil.delete(new Configuration(), output);
      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
    }
  }

  /**
   * Run the canopy clustering job on an input dataset using the given distance
   * measure, t1 and t2 parameters. All output data will be written to the
   * output directory, which will be initially deleted if it exists. The
   * clustered points will reside in the path <output>/clustered-points. By
   * default, the job expects the a file containing synthetic_control.data as
   * obtained from
   * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
   * resides in a directory named "testdata", and writes output to a directory
   * named "output".
   *
   * @param input
   *          the String denoting the input directory path
   * @param output
   *          the String denoting the output directory path
   * @param measure
   *          the DistanceMeasure to use
   * @param t1
   *          the canopy T1 threshold
   * @param t2
   *          the canopy T2 threshold
   */
  private static void run(Path input, Path output, DistanceMeasure measure,
      double t1, double t2) throws Exception {
    Path directoryContainingConvertedInput = new Path(output,
        DIRECTORY_CONTAINING_CONVERTED_INPUT);
    // Vectorize the raw text input before clustering.
    InputDriver.runJob(input, directoryContainingConvertedInput,
        "org.apache.mahout.math.RandomAccessSparseVector");
    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
        output, measure, t1, t2, true, 0.0, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
        "clusters-0-final"), new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
  }

  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
      // Help was requested or parsing failed; nothing to do.
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
deleted file mode 100644
index 43beb78..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.fuzzykmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-
- private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
-
- private Job() {
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length > 0) {
- log.info("Running with only user-supplied arguments");
- ToolRunner.run(new Configuration(), new Job(), args);
- } else {
- log.info("Running with default arguments");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, output);
- run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55, 10, 2.0f, 0.5);
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.convergenceOption().create());
- addOption(DefaultOptionCreator.maxIterationsOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(DefaultOptionCreator.t1Option().create());
- addOption(DefaultOptionCreator.t2Option().create());
- addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be greater than 1", true);
-
- Map<String,List<String>> argMap = parseArguments(args);
- if (argMap == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- if (measureClass == null) {
- measureClass = SquaredEuclideanDistanceMeasure.class.getName();
- }
- double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
- int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- float fuzziness = Float.parseFloat(getOption(M_OPTION));
-
- addOption(new DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
- .withArgument(new ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
- .withDescription("coefficient normalization factor, must be greater than 1").withShortName(M_OPTION).create());
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
- double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
- double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, convergenceDelta);
- return 0;
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
- * containing synthetic_control.data as obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
- * and writes output to a directory named "output".
- *
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param t1
- * the canopy T1 threshold
- * @param t2
- * the canopy T2 threshold
- * @param maxIterations
- * the int maximum number of iterations
- * @param fuzziness
- * the float "m" fuzziness coefficient
- * @param convergenceDelta
- * the double convergence criteria for iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
- int maxIterations, float fuzziness, double convergenceDelta) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running Canopy to get initial clusters");
- Path canopyOutput = new Path(output, "canopies");
- CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);
- log.info("Running FuzzyKMeans");
- FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output,
- convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints"));
- clusterDumper.printClusters(null);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
deleted file mode 100644
index 70c41fe..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.syntheticcontrol.kmeans;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.clustering.conversion.InputDriver;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.utils.clustering.ClusterDumper;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Job extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(Job.class);
-
- private static final String DIRECTORY_CONTAINING_CONVERTED_INPUT = "data";
-
- private Job() {
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length > 0) {
- log.info("Running with only user-supplied arguments");
- ToolRunner.run(new Configuration(), new Job(), args);
- } else {
- log.info("Running with default arguments");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, output);
- run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.numClustersOption().create());
- addOption(DefaultOptionCreator.t1Option().create());
- addOption(DefaultOptionCreator.t2Option().create());
- addOption(DefaultOptionCreator.convergenceOption().create());
- addOption(DefaultOptionCreator.maxIterationsOption().create());
- addOption(DefaultOptionCreator.overwriteOption().create());
-
- Map<String,List<String>> argMap = parseArguments(args);
- if (argMap == null) {
- return -1;
- }
-
- Path input = getInputPath();
- Path output = getOutputPath();
- String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
- if (measureClass == null) {
- measureClass = SquaredEuclideanDistanceMeasure.class.getName();
- }
- double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
- int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
- HadoopUtil.delete(getConf(), output);
- }
- DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
- if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
- int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
- run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
- } else {
- double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
- double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
- }
- return 0;
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
- * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
- * directory named "output".
- *
- * @param conf
- * the Configuration to use
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param measure
- * the DistanceMeasure to use
- * @param k
- * the number of clusters in Kmeans
- * @param convergenceDelta
- * the double convergence criteria for iterations
- * @param maxIterations
- * the int maximum number of iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
- double convergenceDelta, int maxIterations) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running random seed to get initial clusters");
- Path clusters = new Path(output, "random-seeds");
- clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
- log.info("Running KMeans with k = {}", k);
- KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta,
- maxIterations, true, 0.0, false);
- // run ClusterDumper
- Path outGlob = new Path(output, "clusters-*-final");
- Path clusteredPoints = new Path(output,"clusteredPoints");
- log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
- ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
- clusterDumper.printClusters(null);
- }
-
- /**
- * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
- * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
- * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
- * containing synthetic_control.data as obtained from
- * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
- * and writes output to a directory named "output".
- *
- * @param conf
- * the Configuration to use
- * @param input
- * the String denoting the input directory path
- * @param output
- * the String denoting the output directory path
- * @param measure
- * the DistanceMeasure to use
- * @param t1
- * the canopy T1 threshold
- * @param t2
- * the canopy T2 threshold
- * @param convergenceDelta
- * the double convergence criteria for iterations
- * @param maxIterations
- * the int maximum number of iterations
- */
- public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
- double convergenceDelta, int maxIterations) throws Exception {
- Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
- log.info("Preparing Input");
- InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
- log.info("Running Canopy to get initial clusters");
- Path canopyOutput = new Path(output, "canopies");
- CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0,
- false);
- log.info("Running KMeans");
- KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR
- + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false);
- // run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output,
- "clusteredPoints"));
- clusterDumper.printClusters(null);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
deleted file mode 100644
index 92363e5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/DeliciousTagsExample.java
+++ /dev/null
@@ -1,94 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth;

import java.io.IOException;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper;

/**
 * Command-line entry point that converts the del.icio.us tags dataset into
 * grouped transactions by delegating to {@link KeyBasedStringTupleGrouper}.
 */
public final class DeliciousTagsExample {

  private DeliciousTagsExample() { }

  public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
    Option outputOpt = DefaultOptionCreator.outputOption().create();
    Option helpOpt = DefaultOptionCreator.helpOption();
    Option recordSplitterOpt = optionBuilder.withLongName("splitterPattern")
        .withArgument(argumentBuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create())
        .withDescription("Regular Expression pattern used to split given line into fields."
            + " Default value splits comma or tab separated fields."
            + " Default Value: \"[ ,\\t]*\\t[ ,\\t]*\" ")
        .withShortName("regex").create();
    Option encodingOpt = optionBuilder.withLongName("encoding")
        .withArgument(argumentBuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
        .withDescription("(Optional) The file encoding. Default value: UTF-8")
        .withShortName("e").create();
    Group group = groupBuilder.withName("Options")
        .withOption(inputDirOpt)
        .withOption(outputOpt)
        .withOption(helpOpt)
        .withOption(recordSplitterOpt)
        .withOption(encodingOpt)
        .create();

    try {
      Parser parser = new Parser();
      parser.setGroup(group);
      CommandLine cmdLine = parser.parse(args);

      if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
      }

      Parameters params = new Parameters();
      if (cmdLine.hasOption(recordSplitterOpt)) {
        params.set("splitPattern", (String) cmdLine.getValue(recordSplitterOpt));
      }

      String encoding = "UTF-8";
      if (cmdLine.hasOption(encodingOpt)) {
        encoding = (String) cmdLine.getValue(encodingOpt);
      }
      params.set("encoding", encoding);
      params.set("input", (String) cmdLine.getValue(inputDirOpt));
      params.set("output", (String) cmdLine.getValue(outputOpt));
      // Fixed dataset layout: group by fields 1 and 2, select field 3.
      params.set("groupingFieldCount", "2");
      params.set("gfield0", "1");
      params.set("gfield1", "2");
      params.set("selectedFieldCount", "1");
      params.set("field0", "3");
      params.set("maxTransactionLength", "100");

      KeyBasedStringTupleGrouper.startJob(params);
    } catch (OptionException ex) {
      CommandLineUtil.printHelp(group);
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
deleted file mode 100644
index 4c80a31..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleCombiner.java
+++ /dev/null
@@ -1,40 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth.dataset;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.StringTuple;

/**
 * Combiner that de-duplicates the string entries of all tuples sharing a key
 * before they are shipped to the reducer.
 */
public class KeyBasedStringTupleCombiner extends Reducer<Text,StringTuple,Text,StringTuple> {

  @Override
  protected void reduce(Text key,
                        Iterable<StringTuple> values,
                        Context context) throws IOException, InterruptedException {
    Set<String> uniqueEntries = new HashSet<>();
    for (StringTuple tuple : values) {
      uniqueEntries.addAll(tuple.getEntries());
    }
    context.write(key, new StringTuple(uniqueEntries));
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
deleted file mode 100644
index cd17770..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleGrouper.java
+++ /dev/null
@@ -1,77 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth.dataset;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringTuple;

/**
 * Configures and runs the MapReduce job that groups selected input fields into
 * {@link StringTuple} transactions, using {@link KeyBasedStringTupleMapper},
 * {@link KeyBasedStringTupleCombiner} and {@link KeyBasedStringTupleReducer}.
 */
public final class KeyBasedStringTupleGrouper {

  private KeyBasedStringTupleGrouper() { }

  /**
   * Builds and synchronously runs the grouping job.
   *
   * @param params job parameters; must contain "input" and "output" paths, and
   *               is serialized into the job Configuration under "job.parameters"
   *               so mappers/reducers can read the field selections
   * @throws IllegalStateException if the job does not complete successfully
   */
  public static void startJob(Parameters params) throws IOException,
                                                InterruptedException,
                                                ClassNotFoundException {
    Configuration conf = new Configuration();

    // Expose all parameters to the tasks through the Configuration.
    conf.set("job.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                                  + "org.apache.hadoop.io.serializer.WritableSerialization");

    String input = params.get("input");
    // Job.getInstance replaces the deprecated Job(Configuration, String) constructor;
    // the job name previously ran the label and path together without a space.
    Job job = Job.getInstance(conf, "Generating dataset based from input " + input);
    job.setJarByClass(KeyBasedStringTupleGrouper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"));
    FileOutputFormat.setOutputPath(job, outPath);

    // Remove any stale output so the job can re-run idempotently.
    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(KeyBasedStringTupleMapper.class);
    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
    job.setReducerClass(KeyBasedStringTupleReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
deleted file mode 100644
index 362d1ce..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleMapper.java
+++ /dev/null
@@ -1,90 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth.dataset;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringTuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Splits each input line using a {@link Pattern} and emits the fields named by
 * the "gfieldN" indices (concatenated) as the key and the fields named by the
 * "fieldN" indices as a {@link StringTuple} value. Field indices and the split
 * pattern are read from the "job.parameters" Configuration entry in {@link #setup}.
 */
public class KeyBasedStringTupleMapper extends Mapper<LongWritable,Text,Text,StringTuple> {

  private static final Logger log = LoggerFactory.getLogger(KeyBasedStringTupleMapper.class);

  /** Compiled field separator, configured via the "splitPattern" parameter. */
  private Pattern splitter;

  /** Indices of the fields emitted as the tuple value ("fieldN" parameters). */
  private int[] selectedFields;

  /** Indices of the fields concatenated into the output key ("gfieldN" parameters). */
  private int[] groupingFields;

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] fields = splitter.split(value.toString());
    // Malformed records (anything other than exactly 4 fields) are counted and skipped.
    if (fields.length != 4) {
      // SLF4J formats lazily; no need to call value.toString() eagerly.
      log.info("{} {}", fields.length, value);
      context.getCounter("Map", "ERROR").increment(1);
      return;
    }
    Collection<String> oKey = new ArrayList<>();
    for (int groupingField : groupingFields) {
      oKey.add(fields[groupingField]);
      context.setStatus(fields[groupingField]);
    }

    List<String> oValue = new ArrayList<>();
    for (int selectedField : selectedFields) {
      oValue.add(fields[selectedField]);
    }

    // The key is the List's toString() form, e.g. "[a, b]".
    context.write(new Text(oKey.toString()), new StringTuple(oValue));
  }

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
    splitter = Pattern.compile(params.get("splitPattern", "[ \t]*\t[ \t]*"));

    // parseInt avoids the needless Integer boxing of Integer.valueOf.
    int selectedFieldCount = Integer.parseInt(params.get("selectedFieldCount", "0"));
    selectedFields = new int[selectedFieldCount];
    for (int i = 0; i < selectedFieldCount; i++) {
      selectedFields[i] = Integer.parseInt(params.get("field" + i, "0"));
    }

    int groupingFieldCount = Integer.parseInt(params.get("groupingFieldCount", "0"));
    groupingFields = new int[groupingFieldCount];
    for (int i = 0; i < groupingFieldCount; i++) {
      groupingFields[i] = Integer.parseInt(params.get("gfield" + i, "0"));
    }
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
deleted file mode 100644
index a7ef762..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/fpm/pfpgrowth/dataset/KeyBasedStringTupleReducer.java
+++ /dev/null
@@ -1,74 +0,0 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.fpm.pfpgrowth.dataset;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringTuple;

/**
 * Collects the distinct items seen for a key and writes them out as
 * tab-separated transaction lines, splitting into a new line whenever a
 * transaction reaches {@code maxTransactionLength} items. Keys with fewer
 * than two distinct items produce no output.
 */
public class KeyBasedStringTupleReducer extends Reducer<Text,StringTuple,Text,Text> {

  private int maxTransactionLength = 100;

  @Override
  protected void reduce(Text key, Iterable<StringTuple> values, Context context)
    throws IOException, InterruptedException {
    Collection<String> items = new HashSet<>();

    // De-duplicate all entries across the tuples for this key.
    for (StringTuple value : values) {
      items.addAll(value.getEntries());
    }
    if (items.size() > 1) {
      int i = 0;
      StringBuilder sb = new StringBuilder();
      String sep = "";
      for (String field : items) {
        // Flush the current transaction once it holds maxTransactionLength items.
        if (i % maxTransactionLength == 0) {
          if (i != 0) {
            // Null key with TextOutputFormat writes the value only.
            context.write(null, new Text(sb.toString()));
          }
          sb.setLength(0);
          sep = "";
        }

        sb.append(sep).append(field);
        sep = "\t";

        i++;
      }
      // Emit the final (possibly partial) transaction.
      if (sb.length() > 0) {
        context.write(null, new Text(sb.toString()));
      }
    }
  }

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Parameters params = new Parameters(context.getConfiguration().get("job.parameters", ""));
    // parseInt avoids the needless Integer boxing of Integer.valueOf.
    maxTransactionLength = Integer.parseInt(params.get("maxTransactionLength", "100"));
  }
}
r***@apache.org
2018-06-28 14:55:02 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
new file mode 100644
index 0000000..b2ce8b1
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
@@ -0,0 +1,236 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.sgd;

import org.apache.mahout.math.stats.GlobalOnlineAuc;
import org.apache.mahout.math.stats.GroupedOnlineAuc;
import org.apache.mahout.math.stats.OnlineAuc;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Extends {@link LogisticModelParameters} with the knobs specific to
 * {@link AdaptiveLogisticRegression} (reporting interval, averaging window,
 * thread count, prior and AUC evaluator names) and serializes them, together
 * with the trained learner, via the Writable protocol.
 */
public class AdaptiveLogisticModelParameters extends LogisticModelParameters {

  private AdaptiveLogisticRegression alr;
  private int interval = 800;
  private int averageWindow = 500;
  private int threads = 4;
  private String prior = "L1";
  private double priorOption = Double.NaN;
  private String auc = null;

  /**
   * Lazily creates and caches the learner configured from this object's fields.
   *
   * @return the cached (or newly built) AdaptiveLogisticRegression
   */
  public AdaptiveLogisticRegression createAdaptiveLogisticRegression() {

    if (alr == null) {
      alr = new AdaptiveLogisticRegression(getMaxTargetCategories(),
                                           getNumFeatures(), createPrior(prior, priorOption));
      alr.setInterval(interval);
      alr.setAveragingWindow(averageWindow);
      alr.setThreadCount(threads);
      alr.setAucEvaluator(createAUC(auc));
    }
    return alr;
  }

  /**
   * Validates that priors which require a numeric option (TP, EBP) have one.
   *
   * @throws IllegalArgumentException if priorOption is NaN for those priors
   */
  public void checkParameters() {
    if (prior != null) {
      String priorUppercase = prior.toUpperCase(Locale.ENGLISH).trim();
      if (("TP".equals(priorUppercase) || "EBP".equals(priorUppercase)) && Double.isNaN(priorOption)) {
        throw new IllegalArgumentException("You must specify a double value for TPrior and ElasticBandPrior.");
      }
    }
  }

  /**
   * Maps a case-insensitive prior name to a {@link PriorFunction}.
   * Returns null when the name is null or unrecognized.
   */
  private static PriorFunction createPrior(String cmd, double priorOption) {
    if (cmd == null) {
      return null;
    }
    // Normalize once instead of re-uppercasing the string for every comparison.
    switch (cmd.toUpperCase(Locale.ENGLISH).trim()) {
      case "L1":
        return new L1();
      case "L2":
        return new L2();
      case "UP":
        return new UniformPrior();
      case "TP":
        return new TPrior(priorOption);
      case "EBP":
        return new ElasticBandPrior(priorOption);
      default:
        return null;
    }
  }

  /**
   * Maps a case-insensitive AUC evaluator name ("global" or "grouped") to an
   * {@link OnlineAuc}; returns null when the name is null or unrecognized.
   */
  private static OnlineAuc createAUC(String cmd) {
    if (cmd == null) {
      return null;
    }
    switch (cmd.toUpperCase(Locale.ENGLISH).trim()) {
      case "GLOBAL":
        return new GlobalOnlineAuc();
      case "GROUPED":
        return new GroupedOnlineAuc();
      default:
        return null;
    }
  }

  @Override
  public void saveTo(OutputStream out) throws IOException {
    if (alr != null) {
      alr.close();
    }
    setTargetCategories(getCsvRecordFactory().getTargetCategories());
    write(new DataOutputStream(out));
  }

  /**
   * Serializes parameters and model. Field order here must stay in lockstep
   * with {@link #readFields(DataInput)}.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeUTF(getTargetVariable());
    out.writeInt(getTypeMap().size());
    for (Map.Entry<String, String> entry : getTypeMap().entrySet()) {
      out.writeUTF(entry.getKey());
      out.writeUTF(entry.getValue());
    }
    out.writeInt(getNumFeatures());
    out.writeInt(getMaxTargetCategories());
    out.writeInt(getTargetCategories().size());
    for (String category : getTargetCategories()) {
      out.writeUTF(category);
    }

    out.writeInt(interval);
    out.writeInt(averageWindow);
    out.writeInt(threads);
    out.writeUTF(prior);
    out.writeDouble(priorOption);
    // NOTE(review): writeUTF throws NullPointerException when its argument is
    // null, and auc defaults to null — callers must set it before saving.
    // Changing this would alter the wire format, so it is only flagged here.
    out.writeUTF(auc);

    // skip csv
    alr.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    setTargetVariable(in.readUTF());
    int typeMapSize = in.readInt();
    Map<String, String> typeMap = new HashMap<>(typeMapSize);
    for (int i = 0; i < typeMapSize; i++) {
      String key = in.readUTF();
      String value = in.readUTF();
      typeMap.put(key, value);
    }
    setTypeMap(typeMap);

    setNumFeatures(in.readInt());
    setMaxTargetCategories(in.readInt());
    int targetCategoriesSize = in.readInt();
    List<String> targetCategories = new ArrayList<>(targetCategoriesSize);
    for (int i = 0; i < targetCategoriesSize; i++) {
      targetCategories.add(in.readUTF());
    }
    setTargetCategories(targetCategories);

    interval = in.readInt();
    averageWindow = in.readInt();
    threads = in.readInt();
    prior = in.readUTF();
    priorOption = in.readDouble();
    auc = in.readUTF();

    alr = new AdaptiveLogisticRegression();
    alr.readFields(in);
  }

  private static AdaptiveLogisticModelParameters loadFromStream(InputStream in) throws IOException {
    AdaptiveLogisticModelParameters result = new AdaptiveLogisticModelParameters();
    result.readFields(new DataInputStream(in));
    return result;
  }

  /** Loads a model from a file; the stream is closed by try-with-resources. */
  public static AdaptiveLogisticModelParameters loadFromFile(File in) throws IOException {
    try (InputStream input = new FileInputStream(in)) {
      return loadFromStream(input);
    }
  }

  public int getInterval() {
    return interval;
  }

  public void setInterval(int interval) {
    this.interval = interval;
  }

  public int getAverageWindow() {
    return averageWindow;
  }

  public void setAverageWindow(int averageWindow) {
    this.averageWindow = averageWindow;
  }

  public int getThreads() {
    return threads;
  }

  public void setThreads(int threads) {
    this.threads = threads;
  }

  public String getPrior() {
    return prior;
  }

  public void setPrior(String prior) {
    this.prior = prior;
  }

  public String getAuc() {
    return auc;
  }

  public void setAuc(String auc) {
    this.auc = auc;
  }

  public double getPriorOption() {
    return priorOption;
  }

  public void setPriorOption(double priorOption) {
    this.priorOption = priorOption;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
new file mode 100644
index 0000000..e762924
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/LogisticModelParameters.java
@@ -0,0 +1,265 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.sgd;

import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.Writable;

/**
 * Encapsulates everything we need to know about a model and how it reads and vectorizes its input.
 * This encapsulation allows us to coherently save and restore a model from a file. This also
 * allows us to keep command line arguments that affect learning in a coherent way.
 */
public class LogisticModelParameters implements Writable {
  private String targetVariable;
  private Map<String, String> typeMap;
  private int numFeatures;
  private boolean useBias;
  private int maxTargetCategories;
  private List<String> targetCategories;
  private double lambda;
  private double learningRate;
  // Lazily built; reset to null by readFields so it is rebuilt on demand.
  private CsvRecordFactory csv;
  private OnlineLogisticRegression lr;

  /**
   * Returns a CsvRecordFactory compatible with this logistic model. The reason that this is tied
   * in here is so that we have access to the list of target categories when it comes time to save
   * the model. If the input isn't CSV, then calling setTargetCategories before calling saveTo will
   * suffice.
   *
   * @return The CsvRecordFactory.
   */
  public CsvRecordFactory getCsvRecordFactory() {
    if (csv == null) {
      csv = new CsvRecordFactory(getTargetVariable(), getTypeMap())
              .maxTargetValue(getMaxTargetCategories())
              .includeBiasTerm(useBias());
      if (targetCategories != null) {
        csv.defineTargetCategories(targetCategories);
      }
    }
    return csv;
  }

  /**
   * Creates a logistic regression trainer using the parameters collected here.
   *
   * @return The newly allocated OnlineLogisticRegression object
   */
  public OnlineLogisticRegression createRegression() {
    if (lr == null) {
      lr = new OnlineLogisticRegression(getMaxTargetCategories(), getNumFeatures(), new L1())
              .lambda(getLambda())
              .learningRate(getLearningRate())
              .alpha(1 - 1.0e-3);
    }
    return lr;
  }

  /**
   * Saves a model to an output stream.
   */
  public void saveTo(OutputStream out) throws IOException {
    Closeables.close(lr, false);
    targetCategories = getCsvRecordFactory().getTargetCategories();
    write(new DataOutputStream(out));
  }

  /**
   * Reads a model from a stream.
   */
  public static LogisticModelParameters loadFrom(InputStream in) throws IOException {
    LogisticModelParameters result = new LogisticModelParameters();
    result.readFields(new DataInputStream(in));
    return result;
  }

  /**
   * Reads a model from a file.
   * @throws IOException If there is an error opening or closing the file.
   */
  public static LogisticModelParameters loadFrom(File in) throws IOException {
    try (InputStream input = new FileInputStream(in)) {
      return loadFrom(input);
    }
  }

  /**
   * Serializes parameters then the model. Field order must stay in lockstep
   * with {@link #readFields(DataInput)}.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeUTF(targetVariable);
    out.writeInt(typeMap.size());
    for (Map.Entry<String,String> entry : typeMap.entrySet()) {
      out.writeUTF(entry.getKey());
      out.writeUTF(entry.getValue());
    }
    out.writeInt(numFeatures);
    out.writeBoolean(useBias);
    out.writeInt(maxTargetCategories);

    // A zero count stands in for an absent category list.
    if (targetCategories == null) {
      out.writeInt(0);
    } else {
      out.writeInt(targetCategories.size());
      for (String category : targetCategories) {
        out.writeUTF(category);
      }
    }
    out.writeDouble(lambda);
    out.writeDouble(learningRate);
    // skip csv
    lr.write(out);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    targetVariable = in.readUTF();
    int typeMapSize = in.readInt();
    typeMap = new HashMap<>(typeMapSize);
    for (int i = 0; i < typeMapSize; i++) {
      String key = in.readUTF();
      String value = in.readUTF();
      typeMap.put(key, value);
    }
    numFeatures = in.readInt();
    useBias = in.readBoolean();
    maxTargetCategories = in.readInt();
    int targetCategoriesSize = in.readInt();
    targetCategories = new ArrayList<>(targetCategoriesSize);
    for (int i = 0; i < targetCategoriesSize; i++) {
      targetCategories.add(in.readUTF());
    }
    lambda = in.readDouble();
    learningRate = in.readDouble();
    // Drop any stale record factory; it is rebuilt lazily from the new state.
    csv = null;
    lr = new OnlineLogisticRegression();
    lr.readFields(in);
  }

  /**
   * Sets the types of the predictors. This will later be used when reading CSV data. If you don't
   * use the CSV data and convert to vectors on your own, you don't need to call this.
   *
   * @param predictorList The list of variable names.
   * @param typeList The list of types in the format preferred by CsvRecordFactory.
   */
  public void setTypeMap(Iterable<String> predictorList, List<String> typeList) {
    Preconditions.checkArgument(!typeList.isEmpty(), "Must have at least one type specifier");
    typeMap = new HashMap<>();
    Iterator<String> iTypes = typeList.iterator();
    String lastType = null;
    // Iterate as String (not Object) — the Iterable is already typed.
    for (String predictor : predictorList) {
      // type list can be short .. we just repeat last spec
      if (iTypes.hasNext()) {
        lastType = iTypes.next();
      }
      typeMap.put(predictor, lastType);
    }
  }

  /**
   * Sets the target variable. If you don't use the CSV record factory, then this is irrelevant.
   *
   * @param targetVariable The name of the target variable.
   */
  public void setTargetVariable(String targetVariable) {
    this.targetVariable = targetVariable;
  }

  /**
   * Sets the number of target categories to be considered.
   *
   * @param maxTargetCategories The number of target categories.
   */
  public void setMaxTargetCategories(int maxTargetCategories) {
    this.maxTargetCategories = maxTargetCategories;
  }

  public void setNumFeatures(int numFeatures) {
    this.numFeatures = numFeatures;
  }

  /** Also updates maxTargetCategories to match the supplied list's size. */
  public void setTargetCategories(List<String> targetCategories) {
    this.targetCategories = targetCategories;
    maxTargetCategories = targetCategories.size();
  }

  public List<String> getTargetCategories() {
    return this.targetCategories;
  }

  public void setUseBias(boolean useBias) {
    this.useBias = useBias;
  }

  public boolean useBias() {
    return useBias;
  }

  public String getTargetVariable() {
    return targetVariable;
  }

  public Map<String, String> getTypeMap() {
    return typeMap;
  }

  public void setTypeMap(Map<String, String> map) {
    this.typeMap = map;
  }

  public int getNumFeatures() {
    return numFeatures;
  }

  public int getMaxTargetCategories() {
    return maxTargetCategories;
  }

  public double getLambda() {
    return lambda;
  }

  public void setLambda(double lambda) {
    this.lambda = lambda;
  }

  public double getLearningRate() {
    return learningRate;
  }

  public void setLearningRate(double learningRate) {
    this.learningRate = learningRate;
  }
}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
new file mode 100644
index 0000000..3ec6a06
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/PrintResourceOrFile.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Preconditions;
+
+import java.io.BufferedReader;
+
+/**
+ * Uses the same logic as TrainLogistic and RunLogistic for finding an input, but instead
+ * of processing the input, this class just prints the input to standard out.
+ */
+public final class PrintResourceOrFile {
+
+ private PrintResourceOrFile() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ Preconditions.checkArgument(args.length == 1, "Must have a single argument that names a file or resource.");
+ try (BufferedReader in = TrainLogistic.open(args[0])){
+ String line;
+ while ((line = in.readLine()) != null) {
+ System.out.println(line);
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
new file mode 100644
index 0000000..678a8f5
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunAdaptiveLogistic.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+public final class RunAdaptiveLogistic {
+
+ private static String inputFile;
+ private static String modelFile;
+ private static String outputFile;
+ private static String idColumn;
+ private static boolean maxScoreOnly;
+
+ private RunAdaptiveLogistic() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+
+ static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+ if (!parseArgs(args)) {
+ return;
+ }
+ AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
+ .loadFromFile(new File(modelFile));
+
+ CsvRecordFactory csv = lmp.getCsvRecordFactory();
+ csv.setIdName(idColumn);
+
+ AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
+
+ State<Wrapper, CrossFoldLearner> best = lr.getBest();
+ if (best == null) {
+ output.println("AdaptiveLogisticRegression has not be trained probably.");
+ return;
+ }
+ CrossFoldLearner learner = best.getPayload().getLearner();
+
+ BufferedReader in = TrainAdaptiveLogistic.open(inputFile);
+ int k = 0;
+
+ try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile),
+ Charsets.UTF_8))) {
+ out.write(idColumn + ",target,score");
+ out.newLine();
+
+ String line = in.readLine();
+ csv.firstLine(line);
+ line = in.readLine();
+ Map<String, Double> results = new HashMap<>();
+ while (line != null) {
+ Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+ csv.processLine(line, v, false);
+ Vector scores = learner.classifyFull(v);
+ results.clear();
+ if (maxScoreOnly) {
+ results.put(csv.getTargetLabel(scores.maxValueIndex()),
+ scores.maxValue());
+ } else {
+ for (int i = 0; i < scores.size(); i++) {
+ results.put(csv.getTargetLabel(i), scores.get(i));
+ }
+ }
+
+ for (Map.Entry<String, Double> entry : results.entrySet()) {
+ out.write(csv.getIdString(line) + ',' + entry.getKey() + ',' + entry.getValue());
+ out.newLine();
+ }
+ k++;
+ if (k % 100 == 0) {
+ output.println(k + " records processed");
+ }
+ line = in.readLine();
+ }
+ out.flush();
+ }
+ output.println(k + " records processed totally.");
+ }
+
+ private static boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help")
+ .withDescription("print this list").create();
+
+ Option quiet = builder.withLongName("quiet")
+ .withDescription("be extra quiet").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder
+ .withLongName("input")
+ .withRequired(true)
+ .withArgument(
+ argumentBuilder.withName("input").withMaximum(1)
+ .create())
+ .withDescription("where to get training data").create();
+
+ Option modelFileOption = builder
+ .withLongName("model")
+ .withRequired(true)
+ .withArgument(
+ argumentBuilder.withName("model").withMaximum(1)
+ .create())
+ .withDescription("where to get the trained model").create();
+
+ Option outputFileOption = builder
+ .withLongName("output")
+ .withRequired(true)
+ .withDescription("the file path to output scores")
+ .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+ .create();
+
+ Option idColumnOption = builder
+ .withLongName("idcolumn")
+ .withRequired(true)
+ .withDescription("the name of the id column for each record")
+ .withArgument(argumentBuilder.withName("idcolumn").withMaximum(1).create())
+ .create();
+
+ Option maxScoreOnlyOption = builder
+ .withLongName("maxscoreonly")
+ .withDescription("only output the target label with max scores")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help).withOption(quiet)
+ .withOption(inputFileOption).withOption(modelFileOption)
+ .withOption(outputFileOption).withOption(idColumnOption)
+ .withOption(maxScoreOnlyOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ inputFile = getStringArgument(cmdLine, inputFileOption);
+ modelFile = getStringArgument(cmdLine, modelFileOption);
+ outputFile = getStringArgument(cmdLine, outputFileOption);
+ idColumn = getStringArgument(cmdLine, idColumnOption);
+ maxScoreOnly = getBooleanArgument(cmdLine, maxScoreOnlyOption);
+ return true;
+ }
+
+ private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+ return cmdLine.hasOption(option);
+ }
+
+ private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+ return (String) cmdLine.getValue(inputFile);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
new file mode 100644
index 0000000..2d57016
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/RunLogistic.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Locale;
+
+public final class RunLogistic {
+
+ private static String inputFile;
+ private static String modelFile;
+ private static boolean showAuc;
+ private static boolean showScores;
+ private static boolean showConfusion;
+
+ private RunLogistic() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+
+ static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+ if (parseArgs(args)) {
+ if (!showAuc && !showConfusion && !showScores) {
+ showAuc = true;
+ showConfusion = true;
+ }
+
+ Auc collector = new Auc();
+ LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile));
+
+ CsvRecordFactory csv = lmp.getCsvRecordFactory();
+ OnlineLogisticRegression lr = lmp.createRegression();
+ BufferedReader in = TrainLogistic.open(inputFile);
+ String line = in.readLine();
+ csv.firstLine(line);
+ line = in.readLine();
+ if (showScores) {
+ output.println("\"target\",\"model-output\",\"log-likelihood\"");
+ }
+ while (line != null) {
+ Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+ int target = csv.processLine(line, v);
+
+ double score = lr.classifyScalar(v);
+ if (showScores) {
+ output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v));
+ }
+ collector.add(target, score);
+ line = in.readLine();
+ }
+
+ if (showAuc) {
+ output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc());
+ }
+ if (showConfusion) {
+ Matrix m = collector.confusion();
+ output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n",
+ m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
+ m = collector.entropy();
+ output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n",
+ m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1));
+ }
+ }
+ }
+
+ private static boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
+
+ Option auc = builder.withLongName("auc").withDescription("print AUC").create();
+ Option confusion = builder.withLongName("confusion").withDescription("print confusion matrix").create();
+
+ Option scores = builder.withLongName("scores").withDescription("print scores").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder.withLongName("input")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("where to get training data")
+ .create();
+
+ Option modelFileOption = builder.withLongName("model")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+ .withDescription("where to get a model")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(quiet)
+ .withOption(auc)
+ .withOption(scores)
+ .withOption(confusion)
+ .withOption(inputFileOption)
+ .withOption(modelFileOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ inputFile = getStringArgument(cmdLine, inputFileOption);
+ modelFile = getStringArgument(cmdLine, modelFileOption);
+ showAuc = getBooleanArgument(cmdLine, auc);
+ showScores = getBooleanArgument(cmdLine, scores);
+ showConfusion = getBooleanArgument(cmdLine, confusion);
+
+ return true;
+ }
+
+ private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+ return cmdLine.hasOption(option);
+ }
+
+ private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+ return (String) cmdLine.getValue(inputFile);
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
new file mode 100644
index 0000000..c657803
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDHelper.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.Multiset;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.function.DoubleFunction;
+import org.apache.mahout.math.function.Functions;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+
/**
 * Shared utilities for the SGD newsgroup example drivers: model dissection,
 * random permutation of the input files, and periodic training diagnostics.
 */
public final class SGDHelper {

  // Human-readable names for leakType % 3 used in diagnostic output.
  private static final String[] LEAK_LABELS = {"none", "month-year", "day-month-year"};

  private SGDHelper() {
  }

  /**
   * Prints a feature-weight dissection of the best learner found so far.
   * Encodes 500 randomly chosen files with trace dictionaries enabled, feeds
   * them to a {@link ModelDissector}, and prints the top-100 weights.
   *
   * @param leakType          which date-derived features are allowed to "leak" into the encoding.
   * @param dictionary        maps newsgroup names to category indexes.
   * @param learningAlgorithm the trained algorithm whose best learner is dissected.
   * @param files             the corpus files; 500 of them are sampled.
   * @param overallCounts     running word counts shared with the encoder.
   */
  public static void dissect(int leakType,
                             Dictionary dictionary,
                             AdaptiveLogisticRegression learningAlgorithm,
                             Iterable<File> files, Multiset<String> overallCounts) throws IOException {
    CrossFoldLearner model = learningAlgorithm.getBest().getPayload().getLearner();
    // close() flushes any pending regularization updates before inspection.
    model.close();

    Map<String, Set<Integer>> traceDictionary = new TreeMap<>();
    ModelDissector md = new ModelDissector();

    NewsgroupHelper helper = new NewsgroupHelper();
    helper.getEncoder().setTraceDictionary(traceDictionary);
    helper.getBias().setTraceDictionary(traceDictionary);

    for (File file : permute(files, helper.getRandom()).subList(0, 500)) {
      // The newsgroup label is the name of the directory containing the file.
      String ng = file.getParentFile().getName();
      int actual = dictionary.intern(ng);

      // The trace dictionary is reused per document, so clear it each time.
      traceDictionary.clear();
      Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
      md.update(v, traceDictionary, model);
    }

    List<String> ngNames = new ArrayList<>(dictionary.values());
    List<ModelDissector.Weight> weights = md.summary(100);
    System.out.println("============");
    System.out.println("Model Dissection");
    for (ModelDissector.Weight w : weights) {
      // NOTE(review): the "+ 1" offset into ngNames looks suspicious — confirm
      // whether getMaxImpact() is 0-based relative to this list.
      System.out.printf("%s\t%.1f\t%s\t%.1f\t%s\t%.1f\t%s%n",
          w.getFeature(), w.getWeight(), ngNames.get(w.getMaxImpact() + 1),
          w.getCategory(1), w.getWeight(1), w.getCategory(2), w.getWeight(2));
    }
  }

  /**
   * Returns a uniformly random permutation of {@code files} using the
   * "inside-out" Fisher-Yates shuffle (each new element is swapped into a
   * random position as it is appended).
   *
   * @param files the files to permute; not modified.
   * @param rand  source of randomness.
   * @return a new shuffled list.
   */
  public static List<File> permute(Iterable<File> files, Random rand) {
    List<File> r = new ArrayList<>();
    for (File file : files) {
      int i = rand.nextInt(r.size() + 1);
      if (i == r.size()) {
        r.add(file);
      } else {
        r.add(r.get(i));
        r.set(i, file);
      }
    }
    return r;
  }

  /**
   * Collects diagnostics from the current best learner and, on a roughly
   * logarithmic schedule of record counts (1,2,5 x 10^m), checkpoints the model
   * to a temp file and prints a progress line.
   *
   * @param leakType leak configuration index; printed via {@link #LEAK_LABELS}.
   * @param k        number of records processed so far; drives the log schedule.
   * @param best     current best learner state, or null if none yet.
   */
  static void analyzeState(SGDInfo info, int leakType, int k, State<AdaptiveLogisticRegression.Wrapper,
      CrossFoldLearner> best) throws IOException {
    int bump = info.getBumps()[(int) Math.floor(info.getStep()) % info.getBumps().length];
    int scale = (int) Math.pow(10, Math.floor(info.getStep() / info.getBumps().length));
    double maxBeta;
    double nonZeros;
    double positive;
    double norm;

    double lambda = 0;
    double mu = 0;

    if (best != null) {
      CrossFoldLearner state = best.getPayload().getLearner();
      info.setAverageCorrect(state.percentCorrect());
      info.setAverageLL(state.logLikelihood());

      OnlineLogisticRegression model = state.getModels().get(0);
      // finish off pending regularization
      model.close();

      Matrix beta = model.getBeta();
      // Largest absolute coefficient.
      maxBeta = beta.aggregate(Functions.MAX, Functions.ABS);
      // Count of coefficients with magnitude above a small threshold.
      nonZeros = beta.aggregate(Functions.PLUS, new DoubleFunction() {
        @Override
        public double apply(double v) {
          return Math.abs(v) > 1.0e-6 ? 1 : 0;
        }
      });
      // Count of strictly positive coefficients.
      positive = beta.aggregate(Functions.PLUS, new DoubleFunction() {
        @Override
        public double apply(double v) {
          return v > 0 ? 1 : 0;
        }
      });
      // L1 norm of the coefficient matrix.
      norm = beta.aggregate(Functions.PLUS, Functions.ABS);

      lambda = best.getMappedParams()[0];
      mu = best.getMappedParams()[1];
    } else {
      maxBeta = 0;
      nonZeros = 0;
      positive = 0;
      norm = 0;
    }
    if (k % (bump * scale) == 0) {
      if (best != null) {
        File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group-" + k + ".model");
        ModelSerializer.writeBinary(modelFile.getAbsolutePath(), best.getPayload().getLearner().getModels().get(0));
      }

      // Advance the schedule so the next report happens later (log spacing).
      info.setStep(info.getStep() + 0.25);
      System.out.printf("%.2f\t%.2f\t%.2f\t%.2f\t%.8g\t%.8g\t", maxBeta, nonZeros, positive, norm, lambda, mu);
      System.out.printf("%d\t%.3f\t%.2f\t%s%n",
          k, info.getAverageLL(), info.getAverageCorrect() * 100, LEAK_LABELS[leakType % 3]);
    }
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
new file mode 100644
index 0000000..be55d43
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SGDInfo.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
/**
 * Mutable holder for running SGD training diagnostics: average log-likelihood,
 * average percent-correct, the reporting step, and the 1-2-5 bump schedule.
 */
final class SGDInfo {

  private double averageLL;
  private double averageCorrect;
  private double step;
  // Default reporting schedule multipliers: 1, 2, 5 (times powers of ten).
  private int[] bumps = new int[]{1, 2, 5};

  /** @return the running average log-likelihood. */
  double getAverageLL() {
    return this.averageLL;
  }

  /** Updates the running average log-likelihood. */
  void setAverageLL(double averageLL) {
    this.averageLL = averageLL;
  }

  /** @return the running average fraction of correct classifications. */
  double getAverageCorrect() {
    return this.averageCorrect;
  }

  /** Updates the running average fraction correct. */
  void setAverageCorrect(double averageCorrect) {
    this.averageCorrect = averageCorrect;
  }

  /** @return the current reporting step. */
  double getStep() {
    return this.step;
  }

  /** Sets the current reporting step. */
  void setStep(double step) {
    this.step = step;
  }

  /** @return the bump schedule multipliers. */
  int[] getBumps() {
    return this.bumps;
  }

  /** Replaces the bump schedule multipliers. */
  void setBumps(int[] bumps) {
    this.bumps = bumps;
  }

}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
new file mode 100644
index 0000000..b3da452
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import com.google.common.io.Files;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.list.IntArrayList;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+/**
+ * Shows how different encoding choices can make big speed differences.
+ * <p/>
+ * Run with command line options --generate 1000000 test.csv to generate a million data lines in
+ * test.csv.
+ * <p/>
+ * Run with command line options --parser test.csv to time how long it takes to parse and encode
+ * those million data points
+ * <p/>
+ * Run with command line options --fast test.csv to time how long it takes to parse and encode those
+ * million data points using byte-level parsing and direct value encoding.
+ * <p/>
+ * This doesn't demonstrate text encoding which is subject to somewhat different tricks. The basic
+ * idea of caching hash locations and byte level parsing still very much applies to text, however.
+ */
+public final class SimpleCsvExamples {
+
+ public static final char SEPARATOR_CHAR = '\t';
+ private static final int FIELDS = 100;
+
+ private static final Logger log = LoggerFactory.getLogger(SimpleCsvExamples.class);
+
+ private SimpleCsvExamples() {}
+
+ public static void main(String[] args) throws IOException {
+ FeatureVectorEncoder[] encoder = new FeatureVectorEncoder[FIELDS];
+ for (int i = 0; i < FIELDS; i++) {
+ encoder[i] = new ConstantValueEncoder("v" + 1);
+ }
+
+ OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
+ for (int i = 0; i < FIELDS; i++) {
+ s[i] = new OnlineSummarizer();
+ }
+ long t0 = System.currentTimeMillis();
+ Vector v = new DenseVector(1000);
+ if ("--generate".equals(args[0])) {
+ try (PrintWriter out =
+ new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(args[2])), Charsets.UTF_8))) {
+ int n = Integer.parseInt(args[1]);
+ for (int i = 0; i < n; i++) {
+ Line x = Line.generate();
+ out.println(x);
+ }
+ }
+ } else if ("--parse".equals(args[0])) {
+ try (BufferedReader in = Files.newReader(new File(args[1]), Charsets.UTF_8)){
+ String line = in.readLine();
+ while (line != null) {
+ v.assign(0);
+ Line x = new Line(line);
+ for (int i = 0; i < FIELDS; i++) {
+ s[i].add(x.getDouble(i));
+ encoder[i].addToVector(x.get(i), v);
+ }
+ line = in.readLine();
+ }
+ }
+ String separator = "";
+ for (int i = 0; i < FIELDS; i++) {
+ System.out.printf("%s%.3f", separator, s[i].getMean());
+ separator = ",";
+ }
+ } else if ("--fast".equals(args[0])) {
+ try (FastLineReader in = new FastLineReader(new FileInputStream(args[1]))){
+ FastLine line = in.read();
+ while (line != null) {
+ v.assign(0);
+ for (int i = 0; i < FIELDS; i++) {
+ double z = line.getDouble(i);
+ s[i].add(z);
+ encoder[i].addToVector((byte[]) null, z, v);
+ }
+ line = in.read();
+ }
+ }
+
+ String separator = "";
+ for (int i = 0; i < FIELDS; i++) {
+ System.out.printf("%s%.3f", separator, s[i].getMean());
+ separator = ",";
+ }
+ }
+ System.out.printf("\nElapsed time = %.3f%n", (System.currentTimeMillis() - t0) / 1000.0);
+ }
+
+
+ private static final class Line {
+ private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
+ public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
+
+ public static final Random RAND = RandomUtils.getRandom();
+
+ private final List<String> data;
+
+ private Line(CharSequence line) {
+ data = Lists.newArrayList(ON_TABS.split(line));
+ }
+
+ private Line() {
+ data = new ArrayList<>();
+ }
+
+ public double getDouble(int field) {
+ return Double.parseDouble(data.get(field));
+ }
+
+ /**
+ * Generate a random line with 20 fields each with integer values.
+ *
+ * @return A new line with data.
+ */
+ public static Line generate() {
+ Line r = new Line();
+ for (int i = 0; i < FIELDS; i++) {
+ double mean = ((i + 1) * 257) % 50 + 1;
+ r.data.add(Integer.toString(randomValue(mean)));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a random exponentially distributed integer with a particular mean value. This is
+ * just a way to create more small numbers than big numbers.
+ *
+ * @param mean mean of the distribution
+ * @return random exponentially distributed integer with the specific mean
+ */
+ private static int randomValue(double mean) {
+ return (int) (-mean * Math.log1p(-RAND.nextDouble()));
+ }
+
+ @Override
+ public String toString() {
+ return WITH_COMMAS.join(data);
+ }
+
+ public String get(int field) {
+ return data.get(field);
+ }
+ }
+
+ private static final class FastLine {
+
+ private final ByteBuffer base;
+ private final IntArrayList start = new IntArrayList();
+ private final IntArrayList length = new IntArrayList();
+
+ private FastLine(ByteBuffer base) {
+ this.base = base;
+ }
+
+ public static FastLine read(ByteBuffer buf) {
+ FastLine r = new FastLine(buf);
+ r.start.add(buf.position());
+ int offset = buf.position();
+ while (offset < buf.limit()) {
+ int ch = buf.get();
+ offset = buf.position();
+ switch (ch) {
+ case '\n':
+ r.length.add(offset - r.start.get(r.length.size()) - 1);
+ return r;
+ case SEPARATOR_CHAR:
+ r.length.add(offset - r.start.get(r.length.size()) - 1);
+ r.start.add(offset);
+ break;
+ default:
+ // nothing to do for now
+ }
+ }
+ throw new IllegalArgumentException("Not enough bytes in buffer");
+ }
+
+ public double getDouble(int field) {
+ int offset = start.get(field);
+ int size = length.get(field);
+ switch (size) {
+ case 1:
+ return base.get(offset) - '0';
+ case 2:
+ return (base.get(offset) - '0') * 10 + base.get(offset + 1) - '0';
+ default:
+ double r = 0;
+ for (int i = 0; i < size; i++) {
+ r = 10 * r + base.get(offset + i) - '0';
+ }
+ return r;
+ }
+ }
+ }
+
+ private static final class FastLineReader implements Closeable {
+ private final InputStream in;
+ private final ByteBuffer buf = ByteBuffer.allocate(100000);
+
+ private FastLineReader(InputStream in) throws IOException {
+ this.in = in;
+ buf.limit(0);
+ fillBuffer();
+ }
+
+ public FastLine read() throws IOException {
+ fillBuffer();
+ if (buf.remaining() > 0) {
+ return FastLine.read(buf);
+ } else {
+ return null;
+ }
+ }
+
+ private void fillBuffer() throws IOException {
+ if (buf.remaining() < 10000) {
+ buf.compact();
+ int n = in.read(buf.array(), buf.position(), buf.remaining());
+ if (n == -1) {
+ buf.flip();
+ } else {
+ buf.limit(buf.position() + n);
+ buf.position(0);
+ }
+ }
+ }
+
+ @Override
+ public void close() {
+ try {
+ Closeables.close(in, true);
+ } catch (IOException e) {
+ log.error(e.getMessage(), e);
+ }
+ }
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
new file mode 100644
index 0000000..074f774
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestASFEmail.java
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+
+/**
+ * Run the ASF email, as trained by TrainASFEmail
+ */
+/**
+ * Command-line driver that scores the ASF email test set against a model
+ * produced by TrainASFEmail and prints a ResultAnalyzer summary to stdout.
+ */
+public final class TestASFEmail {
+
+ private String inputFile; // directory of encoded sequence files (set by parseArgs)
+ private String modelFile; // serialized OnlineLogisticRegression (set by parseArgs)
+
+ private TestASFEmail() {}
+
+ public static void main(String[] args) throws IOException {
+ TestASFEmail runner = new TestASFEmail();
+ if (runner.parseArgs(args)) {
+ // autoflush writer so results appear immediately on stdout
+ runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+ }
+
+ /**
+ * Loads the model, builds the category dictionary in a first pass over the
+ * "test" sequence files, then classifies every vector in a second pass and
+ * prints the aggregated confusion/accuracy report to {@code output}.
+ *
+ * NOTE(review): the FileInputStream and both SequenceFileDirIterators are
+ * never closed -- consider try-with-resources.
+ */
+ public void run(PrintWriter output) throws IOException {
+
+ File base = new File(inputFile);
+ //contains the best model
+ OnlineLogisticRegression classifier =
+ ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
+
+
+ Dictionary asfDictionary = new Dictionary();
+ Configuration conf = new Configuration();
+ // only visit files whose name contains "test"
+ PathFilter testFilter = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().contains("test");
+ }
+ };
+ SequenceFileDirIterator<Text, VectorWritable> iter =
+ new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
+ null, true, conf);
+
+ // first pass: intern every category label so dictionary indices are stable
+ long numItems = 0;
+ while (iter.hasNext()) {
+ Pair<Text, VectorWritable> next = iter.next();
+ asfDictionary.intern(next.getFirst().toString());
+ numItems++;
+ }
+
+ System.out.println(numItems + " test files");
+ ResultAnalyzer ra = new ResultAnalyzer(asfDictionary.values(), "DEFAULT");
+ // second pass: classify each vector and feed actual-vs-predicted to the analyzer
+ iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, testFilter,
+ null, true, conf);
+ while (iter.hasNext()) {
+ Pair<Text, VectorWritable> next = iter.next();
+ String ng = next.getFirst().toString();
+
+ int actual = asfDictionary.intern(ng);
+ Vector result = classifier.classifyFull(next.getSecond().get());
+ int cat = result.maxValueIndex(); // predicted category = argmax score
+ double score = result.maxValue();
+ double ll = classifier.logLikelihood(actual, next.getSecond().get());
+ ClassifierResult cr = new ClassifierResult(asfDictionary.values().get(cat), score, ll);
+ ra.addInstance(asfDictionary.values().get(actual), cr);
+
+ }
+ output.println(ra);
+ }
+
+ /**
+ * Parses --input and --model (both required) into the instance fields.
+ * Returns false when parsing failed or --help was requested.
+ */
+ boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder.withLongName("input")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("where to get training data")
+ .create();
+
+ Option modelFileOption = builder.withLongName("model")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+ .withDescription("where to get a model")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(inputFileOption)
+ .withOption(modelFileOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ // parseAndHelp returns null after printing help or on a parse error
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ inputFile = (String) cmdLine.getValue(inputFileOption);
+ modelFile = (String) cmdLine.getValue(modelFileOption);
+ return true;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
new file mode 100644
index 0000000..f0316e9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TestNewsGroups.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.ClassifierResult;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.classifier.ResultAnalyzer;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Run the 20 news groups test data through SGD, as trained by {@link org.apache.mahout.classifier.sgd.TrainNewsGroups}.
+ */
+/**
+ * Run the 20 news groups test data through SGD, as trained by {@link org.apache.mahout.classifier.sgd.TrainNewsGroups}.
+ * Walks a directory tree (one subdirectory per newsgroup), encodes each file,
+ * classifies it, and prints a ResultAnalyzer summary.
+ */
+public final class TestNewsGroups {
+
+ private String inputFile; // root directory of newsgroup subdirectories (set by parseArgs)
+ private String modelFile; // serialized OnlineLogisticRegression (set by parseArgs)
+
+ private TestNewsGroups() {
+ }
+
+ public static void main(String[] args) throws IOException {
+ TestNewsGroups runner = new TestNewsGroups();
+ if (runner.parseArgs(args)) {
+ // autoflush writer so results appear immediately on stdout
+ runner.run(new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+ }
+
+ /**
+ * Loads the model, interns one dictionary entry per newsgroup directory,
+ * then encodes and classifies every file, printing the aggregated report
+ * to {@code output}.
+ *
+ * NOTE(review): the FileInputStream for the model is never closed, and
+ * base.listFiles() may return null for a bad path -- consider guarding.
+ */
+ public void run(PrintWriter output) throws IOException {
+
+ File base = new File(inputFile);
+ //contains the best model
+ OnlineLogisticRegression classifier =
+ ModelSerializer.readBinary(new FileInputStream(modelFile), OnlineLogisticRegression.class);
+
+ Dictionary newsGroups = new Dictionary();
+ Multiset<String> overallCounts = HashMultiset.create();
+
+ // each subdirectory is a category; collect its files for classification
+ List<File> files = new ArrayList<>();
+ for (File newsgroup : base.listFiles()) {
+ if (newsgroup.isDirectory()) {
+ newsGroups.intern(newsgroup.getName());
+ files.addAll(Arrays.asList(newsgroup.listFiles()));
+ }
+ }
+ System.out.println(files.size() + " test files");
+ ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
+ for (File file : files) {
+ // the true label is the name of the file's parent directory
+ String ng = file.getParentFile().getName();
+
+ int actual = newsGroups.intern(ng);
+ NewsgroupHelper helper = new NewsgroupHelper();
+ //no leak type ensures this is a normal vector
+ Vector input = helper.encodeFeatureVector(file, actual, 0, overallCounts);
+ Vector result = classifier.classifyFull(input);
+ int cat = result.maxValueIndex(); // predicted category = argmax score
+ double score = result.maxValue();
+ double ll = classifier.logLikelihood(actual, input);
+ ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
+ ra.addInstance(newsGroups.values().get(actual), cr);
+
+ }
+ output.println(ra);
+ }
+
+ /**
+ * Parses --input and --model (both required) into the instance fields.
+ * Returns false when parsing failed or --help was requested.
+ */
+ boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder.withLongName("input")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("where to get training data")
+ .create();
+
+ Option modelFileOption = builder.withLongName("model")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("model").withMaximum(1).create())
+ .withDescription("where to get a model")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(inputFileOption)
+ .withOption(modelFileOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ // parseAndHelp returns null after printing help or on a parse error
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ inputFile = (String) cmdLine.getValue(inputFileOption);
+ modelFile = (String) cmdLine.getValue(modelFileOption);
+ return true;
+ }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
new file mode 100644
index 0000000..e681f92
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainASFEmail.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Trains an AdaptiveLogisticRegression model on pre-encoded ASF email
+ * vectors ("training" sequence files produced by seq2encoded plus a split)
+ * and writes the best learned model to {@code <output>/asf.model}.
+ */
+public final class TrainASFEmail extends AbstractJob {
+
+ private TrainASFEmail() {
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption("categories", "nc", "The number of categories to train on", true);
+ addOption("cardinality", "c", "The size of the vectors to use", "100000");
+ addOption("threads", "t", "The number of threads to use in the learner", "20");
+ addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
+ + "Higher values require more memory.", "5");
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ File base = new File(getInputPath().toString());
+
+ // NOTE(review): overallCounts is never added to anywhere below, so the
+ // "Word counts" report at the end always prints nothing -- see the TODO
+ // about dissection further down.
+ Multiset<String> overallCounts = HashMultiset.create();
+ File output = new File(getOutputPath().toString());
+ output.mkdirs();
+ int numCats = Integer.parseInt(getOption("categories"));
+ int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
+ int threadCount = Integer.parseInt(getOption("threads", "20"));
+ int poolSize = Integer.parseInt(getOption("poolSize", "5"));
+ Dictionary asfDictionary = new Dictionary();
+ AdaptiveLogisticRegression learningAlgorithm =
+ new AdaptiveLogisticRegression(numCats, cardinality, new L1(), threadCount, poolSize);
+ learningAlgorithm.setInterval(800);
+ learningAlgorithm.setAveragingWindow(500);
+
+ //We ran seq2encoded and split input already, so let's just build up the dictionary
+ Configuration conf = new Configuration();
+ // only visit files whose name contains "training"
+ PathFilter trainFilter = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().contains("training");
+ }
+ };
+ SequenceFileDirIterator<Text, VectorWritable> iter =
+ new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
+ // first pass: intern every category label so dictionary indices are stable
+ long numItems = 0;
+ while (iter.hasNext()) {
+ Pair<Text, VectorWritable> next = iter.next();
+ asfDictionary.intern(next.getFirst().toString());
+ numItems++;
+ }
+
+ System.out.println(numItems + " training files");
+
+ SGDInfo info = new SGDInfo();
+
+ // second pass: actually train on each (label, vector) pair
+ iter = new SequenceFileDirIterator<>(new Path(base.toString()), PathType.LIST, trainFilter,
+ null, true, conf);
+ int k = 0;
+ while (iter.hasNext()) {
+ Pair<Text, VectorWritable> next = iter.next();
+ String ng = next.getFirst().toString();
+ int actual = asfDictionary.intern(ng);
+ //we already have encoded
+ learningAlgorithm.train(actual, next.getSecond().get());
+ k++;
+ State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
+
+ // periodically report training progress via the shared helper
+ SGDHelper.analyzeState(info, 0, k, best);
+ }
+ learningAlgorithm.close();
+ //TODO: how to dissection since we aren't processing the files here
+ //SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
+ System.out.println("exiting main, writing model to " + output);
+
+ // persist only the first model of the best CrossFoldLearner
+ ModelSerializer.writeBinary(output + "/asf.model",
+ learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
+
+ // print up to the 1000 largest word counts in descending order
+ // (currently empty -- see the overallCounts note above)
+ List<Integer> counts = new ArrayList<>();
+ System.out.println("Word counts");
+ for (String count : overallCounts.elementSet()) {
+ counts.add(overallCounts.count(count));
+ }
+ Collections.sort(counts, Ordering.natural().reverse());
+ k = 0;
+ for (Integer count : counts) {
+ System.out.println(k + "\t" + count);
+ k++;
+ if (k > 1000) {
+ break;
+ }
+ }
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ TrainASFEmail trainer = new TrainASFEmail();
+ // NOTE(review): run()'s return code is discarded, so failures do not set
+ // a non-zero process exit status -- consider System.exit(...) or ToolRunner.
+ trainer.run(args);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
new file mode 100644
index 0000000..defb5b9
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainAdaptiveLogistic.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.io.Resources;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Command-line trainer for AdaptiveLogisticRegression over CSV input.
+ * Reads the data {@code passes} times, optionally reporting progress, then
+ * saves the model parameters to the output file and prints a summary of the
+ * learned weights.
+ *
+ * NOTE(review): all state is held in static fields, so this class is not
+ * reentrant and not safe for concurrent use.
+ */
+public final class TrainAdaptiveLogistic {
+
+ private static String inputFile; // CSV training data (resource or file path)
+ private static String outputFile; // destination for the saved model parameters
+ private static AdaptiveLogisticModelParameters lmp;
+ private static int passes; // number of full passes over the input
+ private static boolean showperf; // whether to print periodic perf measures
+ private static int skipperfnum = 99; // report every (skipperfnum + 1) rows
+ private static AdaptiveLogisticRegression model;
+
+ private TrainAdaptiveLogistic() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+
+ /**
+ * Parses arguments, trains the model over the requested number of passes,
+ * saves the parameters to {@code outputFile}, and writes a human-readable
+ * dump of per-predictor weights and the beta matrix to {@code output}.
+ */
+ static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+ if (parseArgs(args)) {
+
+ CsvRecordFactory csv = lmp.getCsvRecordFactory();
+ model = lmp.createAdaptiveLogisticRegression();
+ State<Wrapper, CrossFoldLearner> best;
+ CrossFoldLearner learner = null;
+
+ int k = 0; // running count of training rows seen across all passes
+ for (int pass = 0; pass < passes; pass++) {
+ BufferedReader in = open(inputFile);
+
+ // read variable names
+ csv.firstLine(in.readLine());
+
+ String line = in.readLine();
+ while (line != null) {
+ // for each new line, get target and predictors
+ Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
+ int targetValue = csv.processLine(line, input);
+
+ // update model
+ model.train(targetValue, input);
+ k++;
+
+ // optional progress report every (skipperfnum + 1) rows
+ if (showperf && (k % (skipperfnum + 1) == 0)) {
+
+ best = model.getBest();
+ if (best != null) {
+ learner = best.getPayload().getLearner();
+ }
+ if (learner != null) {
+ double averageCorrect = learner.percentCorrect();
+ double averageLL = learner.logLikelihood();
+ output.printf("%d\t%.3f\t%.2f%n",
+ k, averageLL, averageCorrect * 100);
+ } else {
+ output.printf(Locale.ENGLISH,
+ "%10d %2d %s%n", k, targetValue,
+ "AdaptiveLogisticRegression has not found a good model ......");
+ }
+ }
+ line = in.readLine();
+ }
+ // NOTE(review): not in a finally block, so the reader leaks if
+ // processing throws -- consider try-with-resources.
+ in.close();
+ }
+
+ best = model.getBest();
+ if (best != null) {
+ learner = best.getPayload().getLearner();
+ }
+ if (learner == null) {
+ output.println("AdaptiveLogisticRegression has failed to train a model.");
+ return;
+ }
+
+ // persist the model parameters (includes the trained model state)
+ try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
+ lmp.saveTo(modelOutput);
+ }
+
+ // dump a readable formula: non-zero predictor weights for row 0,
+ // then per-row weights and the full beta matrix
+ OnlineLogisticRegression lr = learner.getModels().get(0);
+ output.println(lmp.getNumFeatures());
+ output.println(lmp.getTargetVariable() + " ~ ");
+ String sep = "";
+ for (String v : csv.getTraceDictionary().keySet()) {
+ double weight = predictorWeight(lr, 0, csv, v);
+ if (weight != 0) {
+ output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
+ sep = " + ";
+ }
+ }
+ output.printf("%n");
+
+ for (int row = 0; row < lr.getBeta().numRows(); row++) {
+ for (String key : csv.getTraceDictionary().keySet()) {
+ double weight = predictorWeight(lr, row, csv, key);
+ if (weight != 0) {
+ output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
+ }
+ }
+ for (int column = 0; column < lr.getBeta().numCols(); column++) {
+ output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
+ }
+ output.println();
+ }
+ }
+
+ }
+
+ /**
+ * Sums the beta entries of every hashed column that the given predictor
+ * variable traced into, yielding its effective weight for {@code row}.
+ */
+ private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
+ double weight = 0;
+ for (Integer column : csv.getTraceDictionary().get(predictor)) {
+ weight += lr.getBeta().get(row, column);
+ }
+ return weight;
+ }
+
+ /**
+ * Defines and parses the command line, populating the static configuration
+ * fields and {@code lmp}. Returns false when parsing failed or --help was
+ * requested.
+ *
+ * NOTE(review): several locals here (showperf, skipperfnum, inputFile,
+ * outputFile, passes) shadow the static fields of the same name; the
+ * fields are only assigned via the explicit TrainAdaptiveLogistic.x = ...
+ * statements near the end.
+ */
+ private static boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help")
+ .withDescription("print this list").create();
+
+ Option quiet = builder.withLongName("quiet")
+ .withDescription("be extra quiet").create();
+
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option showperf = builder
+ .withLongName("showperf")
+ .withDescription("output performance measures during training")
+ .create();
+
+ Option inputFile = builder
+ .withLongName("input")
+ .withRequired(true)
+ .withArgument(
+ argumentBuilder.withName("input").withMaximum(1)
+ .create())
+ .withDescription("where to get training data").create();
+
+ Option outputFile = builder
+ .withLongName("output")
+ .withRequired(true)
+ .withArgument(
+ argumentBuilder.withName("output").withMaximum(1)
+ .create())
+ .withDescription("where to write the model content").create();
+
+ Option threads = builder.withLongName("threads")
+ .withArgument(
+ argumentBuilder.withName("threads").withDefault("4").create())
+ .withDescription("the number of threads AdaptiveLogisticRegression uses")
+ .create();
+
+
+ Option predictors = builder.withLongName("predictors")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("predictors").create())
+ .withDescription("a list of predictor variables").create();
+
+ Option types = builder
+ .withLongName("types")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("types").create())
+ .withDescription(
+ "a list of predictor variable types (numeric, word, or text)")
+ .create();
+
+ Option target = builder
+ .withLongName("target")
+ .withDescription("the name of the target variable")
+ .withRequired(true)
+ .withArgument(
+ argumentBuilder.withName("target").withMaximum(1)
+ .create())
+ .create();
+
+ Option targetCategories = builder
+ .withLongName("categories")
+ .withDescription("the number of target categories to be considered")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("categories").withMaximum(1).create())
+ .create();
+
+
+ Option features = builder
+ .withLongName("features")
+ .withDescription("the number of internal hashed features to use")
+ .withArgument(
+ argumentBuilder.withName("numFeatures")
+ .withDefault("1000").withMaximum(1).create())
+ .create();
+
+ Option passes = builder
+ .withLongName("passes")
+ .withDescription("the number of times to pass over the input data")
+ .withArgument(
+ argumentBuilder.withName("passes").withDefault("2")
+ .withMaximum(1).create())
+ .create();
+
+ // NOTE(review): interval defaults to 500 and window to 800 here, which is
+ // the reverse of the 800/500 pair hard-coded in TrainASFEmail -- confirm
+ // which pairing is intended.
+ Option interval = builder.withLongName("interval")
+ .withArgument(
+ argumentBuilder.withName("interval").withDefault("500").create())
+ .withDescription("the interval property of AdaptiveLogisticRegression")
+ .create();
+
+ // NOTE(review): "propery" below is a typo in the user-visible help text;
+ // it should read "property" (left unchanged here as it is runtime output).
+ Option window = builder.withLongName("window")
+ .withArgument(
+ argumentBuilder.withName("window").withDefault("800").create())
+ .withDescription("the average propery of AdaptiveLogisticRegression")
+ .create();
+
+ Option skipperfnum = builder.withLongName("skipperfnum")
+ .withArgument(
+ argumentBuilder.withName("skipperfnum").withDefault("99").create())
+ .withDescription("show performance measures every (skipperfnum + 1) rows")
+ .create();
+
+ Option prior = builder.withLongName("prior")
+ .withArgument(
+ argumentBuilder.withName("prior").withDefault("L1").create())
+ .withDescription("the prior algorithm to use: L1, L2, ebp, tp, up")
+ .create();
+
+ Option priorOption = builder.withLongName("prioroption")
+ .withArgument(
+ argumentBuilder.withName("prioroption").create())
+ .withDescription("constructor parameter for ElasticBandPrior and TPrior")
+ .create();
+
+ Option auc = builder.withLongName("auc")
+ .withArgument(
+ argumentBuilder.withName("auc").withDefault("global").create())
+ .withDescription("the auc to use: global or grouped")
+ .create();
+
+
+
+ Group normalArgs = new GroupBuilder().withOption(help)
+ .withOption(quiet).withOption(inputFile).withOption(outputFile)
+ .withOption(target).withOption(targetCategories)
+ .withOption(predictors).withOption(types).withOption(passes)
+ .withOption(interval).withOption(window).withOption(threads)
+ .withOption(prior).withOption(features).withOption(showperf)
+ .withOption(skipperfnum).withOption(priorOption).withOption(auc)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ // parseAndHelp returns null after printing help or on a parse error
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ TrainAdaptiveLogistic.inputFile = getStringArgument(cmdLine, inputFile);
+ TrainAdaptiveLogistic.outputFile = getStringArgument(cmdLine,
+ outputFile);
+
+ List<String> typeList = new ArrayList<>();
+ for (Object x : cmdLine.getValues(types)) {
+ typeList.add(x.toString());
+ }
+
+ List<String> predictorList = new ArrayList<>();
+ for (Object x : cmdLine.getValues(predictors)) {
+ predictorList.add(x.toString());
+ }
+
+ // translate parsed options into model parameters
+ lmp = new AdaptiveLogisticModelParameters();
+ lmp.setTargetVariable(getStringArgument(cmdLine, target));
+ lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
+ lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
+ lmp.setInterval(getIntegerArgument(cmdLine, interval));
+ lmp.setAverageWindow(getIntegerArgument(cmdLine, window));
+ lmp.setThreads(getIntegerArgument(cmdLine, threads));
+ lmp.setAuc(getStringArgument(cmdLine, auc));
+ lmp.setPrior(getStringArgument(cmdLine, prior));
+ if (cmdLine.getValue(priorOption) != null) {
+ lmp.setPriorOption(getDoubleArgument(cmdLine, priorOption));
+ }
+ lmp.setTypeMap(predictorList, typeList);
+ TrainAdaptiveLogistic.showperf = getBooleanArgument(cmdLine, showperf);
+ TrainAdaptiveLogistic.skipperfnum = getIntegerArgument(cmdLine, skipperfnum);
+ TrainAdaptiveLogistic.passes = getIntegerArgument(cmdLine, passes);
+
+ lmp.checkParameters();
+
+ return true;
+ }
+
+ private static String getStringArgument(CommandLine cmdLine,
+ Option inputFile) {
+ return (String) cmdLine.getValue(inputFile);
+ }
+
+ // flag options have no argument; presence on the command line means true
+ private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+ return cmdLine.hasOption(option);
+ }
+
+ private static int getIntegerArgument(CommandLine cmdLine, Option features) {
+ return Integer.parseInt((String) cmdLine.getValue(features));
+ }
+
+ private static double getDoubleArgument(CommandLine cmdLine, Option op) {
+ return Double.parseDouble((String) cmdLine.getValue(op));
+ }
+
+ public static AdaptiveLogisticRegression getModel() {
+ return model;
+ }
+
+ public static LogisticModelParameters getParameters() {
+ return lmp;
+ }
+
+ /**
+ * Opens the input as a UTF-8 reader, first trying the classpath as a
+ * resource and falling back to the local filesystem.
+ */
+ static BufferedReader open(String inputFile) throws IOException {
+ InputStream in;
+ try {
+ in = Resources.getResource(inputFile).openStream();
+ } catch (IllegalArgumentException e) {
+ // not on the classpath -- treat it as a plain file path
+ in = new FileInputStream(new File(inputFile));
+ }
+ return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
+ }
+
+}
r***@apache.org
2018-06-28 14:55:12 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
deleted file mode 100644
index f4b8bcb..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.io.Resources;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-
-/**
- * Train a logistic regression for the examples from Chapter 13 of Mahout in Action
- */
-public final class TrainLogistic {
-
- private static String inputFile;
- private static String outputFile;
- private static LogisticModelParameters lmp;
- private static int passes;
- private static boolean scores;
- private static OnlineLogisticRegression model;
-
- private TrainLogistic() {
- }
-
- public static void main(String[] args) throws Exception {
- mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
-
- static void mainToOutput(String[] args, PrintWriter output) throws Exception {
- if (parseArgs(args)) {
- double logPEstimate = 0;
- int samples = 0;
-
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- OnlineLogisticRegression lr = lmp.createRegression();
- for (int pass = 0; pass < passes; pass++) {
- try (BufferedReader in = open(inputFile)) {
- // read variable names
- csv.firstLine(in.readLine());
-
- String line = in.readLine();
- while (line != null) {
- // for each new line, get target and predictors
- Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
- int targetValue = csv.processLine(line, input);
-
- // check performance while this is still news
- double logP = lr.logLikelihood(targetValue, input);
- if (!Double.isInfinite(logP)) {
- if (samples < 20) {
- logPEstimate = (samples * logPEstimate + logP) / (samples + 1);
- } else {
- logPEstimate = 0.95 * logPEstimate + 0.05 * logP;
- }
- samples++;
- }
- double p = lr.classifyScalar(input);
- if (scores) {
- output.printf(Locale.ENGLISH, "%10d %2d %10.2f %2.4f %10.4f %10.4f%n",
- samples, targetValue, lr.currentLearningRate(), p, logP, logPEstimate);
- }
-
- // now update model
- lr.train(targetValue, input);
-
- line = in.readLine();
- }
- }
- }
-
- try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
- lmp.saveTo(modelOutput);
- }
-
- output.println(lmp.getNumFeatures());
- output.println(lmp.getTargetVariable() + " ~ ");
- String sep = "";
- for (String v : csv.getTraceDictionary().keySet()) {
- double weight = predictorWeight(lr, 0, csv, v);
- if (weight != 0) {
- output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
- sep = " + ";
- }
- }
- output.printf("%n");
- model = lr;
- for (int row = 0; row < lr.getBeta().numRows(); row++) {
- for (String key : csv.getTraceDictionary().keySet()) {
- double weight = predictorWeight(lr, row, csv, key);
- if (weight != 0) {
- output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
- }
- }
- for (int column = 0; column < lr.getBeta().numCols(); column++) {
- output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
- }
- output.println();
- }
- }
- }
-
- private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
- double weight = 0;
- for (Integer column : csv.getTraceDictionary().get(predictor)) {
- weight += lr.getBeta().get(row, column);
- }
- return weight;
- }
-
- private static boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help").withDescription("print this list").create();
-
- Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
- Option scores = builder.withLongName("scores").withDescription("output score diagnostics during training").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFile = builder.withLongName("input")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
- .withDescription("where to get training data")
- .create();
-
- Option outputFile = builder.withLongName("output")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
- .withDescription("where to get training data")
- .create();
-
- Option predictors = builder.withLongName("predictors")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("p").create())
- .withDescription("a list of predictor variables")
- .create();
-
- Option types = builder.withLongName("types")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("t").create())
- .withDescription("a list of predictor variable types (numeric, word, or text)")
- .create();
-
- Option target = builder.withLongName("target")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("target").withMaximum(1).create())
- .withDescription("the name of the target variable")
- .create();
-
- Option features = builder.withLongName("features")
- .withArgument(
- argumentBuilder.withName("numFeatures")
- .withDefault("1000")
- .withMaximum(1).create())
- .withDescription("the number of internal hashed features to use")
- .create();
-
- Option passes = builder.withLongName("passes")
- .withArgument(
- argumentBuilder.withName("passes")
- .withDefault("2")
- .withMaximum(1).create())
- .withDescription("the number of times to pass over the input data")
- .create();
-
- Option lambda = builder.withLongName("lambda")
- .withArgument(argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
- .withDescription("the amount of coefficient decay to use")
- .create();
-
- Option rate = builder.withLongName("rate")
- .withArgument(argumentBuilder.withName("learningRate").withDefault("1e-3").withMaximum(1).create())
- .withDescription("the learning rate")
- .create();
-
- Option noBias = builder.withLongName("noBias")
- .withDescription("don't include a bias term")
- .create();
-
- Option targetCategories = builder.withLongName("categories")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("number").withMaximum(1).create())
- .withDescription("the number of target categories to be considered")
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help)
- .withOption(quiet)
- .withOption(inputFile)
- .withOption(outputFile)
- .withOption(target)
- .withOption(targetCategories)
- .withOption(predictors)
- .withOption(types)
- .withOption(passes)
- .withOption(lambda)
- .withOption(rate)
- .withOption(noBias)
- .withOption(features)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
- TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
-
- List<String> typeList = new ArrayList<>();
- for (Object x : cmdLine.getValues(types)) {
- typeList.add(x.toString());
- }
-
- List<String> predictorList = new ArrayList<>();
- for (Object x : cmdLine.getValues(predictors)) {
- predictorList.add(x.toString());
- }
-
- lmp = new LogisticModelParameters();
- lmp.setTargetVariable(getStringArgument(cmdLine, target));
- lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
- lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
- lmp.setUseBias(!getBooleanArgument(cmdLine, noBias));
- lmp.setTypeMap(predictorList, typeList);
-
- lmp.setLambda(getDoubleArgument(cmdLine, lambda));
- lmp.setLearningRate(getDoubleArgument(cmdLine, rate));
-
- TrainLogistic.scores = getBooleanArgument(cmdLine, scores);
- TrainLogistic.passes = getIntegerArgument(cmdLine, passes);
-
- return true;
- }
-
- private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
- return (String) cmdLine.getValue(inputFile);
- }
-
- private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
- return cmdLine.hasOption(option);
- }
-
- private static int getIntegerArgument(CommandLine cmdLine, Option features) {
- return Integer.parseInt((String) cmdLine.getValue(features));
- }
-
- private static double getDoubleArgument(CommandLine cmdLine, Option op) {
- return Double.parseDouble((String) cmdLine.getValue(op));
- }
-
- public static OnlineLogisticRegression getModel() {
- return model;
- }
-
- public static LogisticModelParameters getParameters() {
- return lmp;
- }
-
- static BufferedReader open(String inputFile) throws IOException {
- InputStream in;
- try {
- in = Resources.getResource(inputFile).openStream();
- } catch (IllegalArgumentException e) {
- in = new FileInputStream(new File(inputFile));
- }
- return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
deleted file mode 100644
index 632b32c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import org.apache.mahout.classifier.NewsgroupHelper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.Dictionary;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Reads and trains an adaptive logistic regression model on the 20 newsgroups data.
- * The first command line argument gives the path of the directory holding the training
- * data. The optional second argument, leakType, defines which classes of features to use.
- * Importantly, leakType controls whether a synthetic date is injected into the data as
- * a target leak and if so, how.
- * <p/>
- * The value of leakType % 3 determines whether the target leak is injected according to
- * the following table:
- * <p/>
- * <table>
- * <tr><td valign='top'>0</td><td>No leak injected</td></tr>
- * <tr><td valign='top'>1</td><td>Synthetic date injected in MMM-yyyy format. This will be a single token and
- * is a perfect target leak since each newsgroup is given a different month</td></tr>
- * <tr><td valign='top'>2</td><td>Synthetic date injected in dd-MMM-yyyy HH:mm:ss format. The day varies
- * and thus there are more leak symbols that need to be learned. Ultimately this is just
- * as big a leak as case 1.</td></tr>
- * </table>
- * <p/>
- * leakType also determines what other text will be indexed. If leakType is greater
- * than or equal to 6, then neither headers nor text body will be used for features and the leak is the only
- * source of data. If leakType is greater than or equal to 3, then subject words will be used as features.
- * If leakType is less than 3, then both subject and body text will be used as features.
- * <p/>
- * A leakType of 0 gives no leak and all textual features.
- * <p/>
- * See the following table for a summary of commonly used values for leakType
- * <p/>
- * <table>
- * <tr><td><b>leakType</b></td><td><b>Leak?</b></td><td><b>Subject?</b></td><td><b>Body?</b></td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>0</td><td>no</td><td>yes</td><td>yes</td></tr>
- * <tr><td>1</td><td>mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td>2</td><td>dd-mmm-yyyy</td><td>yes</td><td>yes</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>3</td><td>no</td><td>yes</td><td>no</td></tr>
- * <tr><td>4</td><td>mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td>5</td><td>dd-mmm-yyyy</td><td>yes</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * <tr><td>6</td><td>no</td><td>no</td><td>no</td></tr>
- * <tr><td>7</td><td>mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td>8</td><td>dd-mmm-yyyy</td><td>no</td><td>no</td></tr>
- * <tr><td colspan=4><hr></td></tr>
- * </table>
- */
-public final class TrainNewsGroups {
-
- private TrainNewsGroups() {
- }
-
- public static void main(String[] args) throws IOException {
- File base = new File(args[0]);
-
- Multiset<String> overallCounts = HashMultiset.create();
-
- int leakType = 0;
- if (args.length > 1) {
- leakType = Integer.parseInt(args[1]);
- }
-
- Dictionary newsGroups = new Dictionary();
-
- NewsgroupHelper helper = new NewsgroupHelper();
- helper.getEncoder().setProbes(2);
- AdaptiveLogisticRegression learningAlgorithm =
- new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
- learningAlgorithm.setInterval(800);
- learningAlgorithm.setAveragingWindow(500);
-
- List<File> files = new ArrayList<>();
- for (File newsgroup : base.listFiles()) {
- if (newsgroup.isDirectory()) {
- newsGroups.intern(newsgroup.getName());
- files.addAll(Arrays.asList(newsgroup.listFiles()));
- }
- }
- Collections.shuffle(files);
- System.out.println(files.size() + " training files");
- SGDInfo info = new SGDInfo();
-
- int k = 0;
-
- for (File file : files) {
- String ng = file.getParentFile().getName();
- int actual = newsGroups.intern(ng);
-
- Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
- learningAlgorithm.train(actual, v);
-
- k++;
- State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
-
- SGDHelper.analyzeState(info, leakType, k, best);
- }
- learningAlgorithm.close();
- SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts);
- System.out.println("exiting main");
-
- File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group.model");
- ModelSerializer.writeBinary(modelFile.getAbsolutePath(),
- learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
-
- List<Integer> counts = new ArrayList<>();
- System.out.println("Word counts");
- for (String count : overallCounts.elementSet()) {
- counts.add(overallCounts.count(count));
- }
- Collections.sort(counts, Ordering.natural().reverse());
- k = 0;
- for (Integer count : counts) {
- System.out.println(k + "\t" + count);
- k++;
- if (k > 1000) {
- break;
- }
- }
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
deleted file mode 100644
index 7a74289..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.util.Locale;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.classifier.ConfusionMatrix;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
-import org.apache.mahout.ep.State;
-import org.apache.mahout.math.Matrix;
-import org.apache.mahout.math.SequentialAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
-/*
- * Auc and averageLikelihood are always shown if possible. If the number of target values is more than 2,
- * then the Auc and entropy matrix are not shown regardless of the showAuc and showEntropy values
- * the user passes, because the current implementation only supports them for two-value targets.
- * */
-public final class ValidateAdaptiveLogistic {
-
- private static String inputFile;
- private static String modelFile;
- private static String defaultCategory;
- private static boolean showAuc;
- private static boolean showScores;
- private static boolean showConfusion;
-
- private ValidateAdaptiveLogistic() {
- }
-
- public static void main(String[] args) throws IOException {
- mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
- }
-
- static void mainToOutput(String[] args, PrintWriter output) throws IOException {
- if (parseArgs(args)) {
- if (!showAuc && !showConfusion && !showScores) {
- showAuc = true;
- showConfusion = true;
- }
-
- Auc collector = null;
- AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
- .loadFromFile(new File(modelFile));
- CsvRecordFactory csv = lmp.getCsvRecordFactory();
- AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
-
- if (lmp.getTargetCategories().size() <= 2) {
- collector = new Auc();
- }
-
- OnlineSummarizer slh = new OnlineSummarizer();
- ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);
-
- State<Wrapper, CrossFoldLearner> best = lr.getBest();
- if (best == null) {
- output.println("AdaptiveLogisticRegression has not be trained probably.");
- return;
- }
- CrossFoldLearner learner = best.getPayload().getLearner();
-
- BufferedReader in = TrainLogistic.open(inputFile);
- String line = in.readLine();
- csv.firstLine(line);
- line = in.readLine();
- if (showScores) {
- output.println("\"target\", \"model-output\", \"log-likelihood\", \"average-likelihood\"");
- }
- while (line != null) {
- Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
- //TODO: How to avoid extra target values not shown in the training process.
- int target = csv.processLine(line, v);
- double likelihood = learner.logLikelihood(target, v);
- double score = learner.classifyFull(v).maxValue();
-
- slh.add(likelihood);
- cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));
-
- if (showScores) {
- output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f%n", target,
- score, learner.logLikelihood(target, v), slh.getMean());
- }
- if (collector != null) {
- collector.add(target, score);
- }
- line = in.readLine();
- }
-
- output.printf(Locale.ENGLISH,"\nLog-likelihood:");
- output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f%n",
- slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());
-
- if (collector != null) {
- output.printf(Locale.ENGLISH, "%nAUC = %.2f%n", collector.auc());
- }
-
- if (showConfusion) {
- output.printf(Locale.ENGLISH, "%n%s%n%n", cm.toString());
-
- if (collector != null) {
- Matrix m = collector.entropy();
- output.printf(Locale.ENGLISH,
- "Entropy Matrix: [[%.1f, %.1f], [%.1f, %.1f]]%n", m.get(0, 0),
- m.get(1, 0), m.get(0, 1), m.get(1, 1));
- }
- }
-
- }
- }
-
- private static boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help")
- .withDescription("print this list").create();
-
- Option quiet = builder.withLongName("quiet")
- .withDescription("be extra quiet").create();
-
- Option auc = builder.withLongName("auc").withDescription("print AUC")
- .create();
- Option confusion = builder.withLongName("confusion")
- .withDescription("print confusion matrix").create();
-
- Option scores = builder.withLongName("scores")
- .withDescription("print scores").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder
- .withLongName("input")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("input").withMaximum(1)
- .create())
- .withDescription("where to get validate data").create();
-
- Option modelFileOption = builder
- .withLongName("model")
- .withRequired(true)
- .withArgument(
- argumentBuilder.withName("model").withMaximum(1)
- .create())
- .withDescription("where to get the trained model").create();
-
- Option defaultCagetoryOption = builder
- .withLongName("defaultCategory")
- .withRequired(false)
- .withArgument(
- argumentBuilder.withName("defaultCategory").withMaximum(1).withDefault("unknown")
- .create())
- .withDescription("the default category value to use").create();
-
- Group normalArgs = new GroupBuilder().withOption(help)
- .withOption(quiet).withOption(auc).withOption(scores)
- .withOption(confusion).withOption(inputFileOption)
- .withOption(modelFileOption).withOption(defaultCagetoryOption).create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
- CommandLine cmdLine = parser.parseAndHelp(args);
-
- if (cmdLine == null) {
- return false;
- }
-
- inputFile = getStringArgument(cmdLine, inputFileOption);
- modelFile = getStringArgument(cmdLine, modelFileOption);
- defaultCategory = getStringArgument(cmdLine, defaultCagetoryOption);
- showAuc = getBooleanArgument(cmdLine, auc);
- showScores = getBooleanArgument(cmdLine, scores);
- showConfusion = getBooleanArgument(cmdLine, confusion);
-
- return true;
- }
-
- private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
- return cmdLine.hasOption(option);
- }
-
- private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
- return (String) cmdLine.getValue(inputFile);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
deleted file mode 100644
index ab3c861..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.classifier.evaluation.Auc;
-import org.apache.mahout.classifier.sgd.L1;
-import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
-
-import java.util.Collections;
-import java.util.List;
-
-/**
- * Uses the SGD classifier on the 'Bank marketing' dataset from UCI.
- *
- * See http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
- *
- * Learn when people accept or reject an offer from the bank via telephone based on income, age, education and more.
- */
-public class BankMarketingClassificationMain {
-
- public static final int NUM_CATEGORIES = 2;
-
- public static void main(String[] args) throws Exception {
- List<TelephoneCall> calls = Lists.newArrayList(new TelephoneCallParser("bank-full.csv"));
-
- double heldOutPercentage = 0.10;
-
- for (int run = 0; run < 20; run++) {
- Collections.shuffle(calls);
- int cutoff = (int) (heldOutPercentage * calls.size());
- List<TelephoneCall> test = calls.subList(0, cutoff);
- List<TelephoneCall> train = calls.subList(cutoff, calls.size());
-
- OnlineLogisticRegression lr = new OnlineLogisticRegression(NUM_CATEGORIES, TelephoneCall.FEATURES, new L1())
- .learningRate(1)
- .alpha(1)
- .lambda(0.000001)
- .stepOffset(10000)
- .decayExponent(0.2);
- for (int pass = 0; pass < 20; pass++) {
- for (TelephoneCall observation : train) {
- lr.train(observation.getTarget(), observation.asVector());
- }
- if (pass % 5 == 0) {
- Auc eval = new Auc(0.5);
- for (TelephoneCall testCall : test) {
- eval.add(testCall.getTarget(), lr.classifyScalar(testCall.asVector()));
- }
- System.out.printf("%d, %.4f, %.4f\n", pass, lr.currentLearningRate(), eval.auc());
- }
- }
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
deleted file mode 100644
index 728ec20..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import org.apache.mahout.math.RandomAccessSparseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
-import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
-import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
-
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class TelephoneCall {
- public static final int FEATURES = 100;
- private static final ConstantValueEncoder interceptEncoder = new ConstantValueEncoder("intercept");
- private static final FeatureVectorEncoder featureEncoder = new StaticWordValueEncoder("feature");
-
- private RandomAccessSparseVector vector;
-
- private Map<String, String> fields = new LinkedHashMap<>();
-
- public TelephoneCall(Iterable<String> fieldNames, Iterable<String> values) {
- vector = new RandomAccessSparseVector(FEATURES);
- Iterator<String> value = values.iterator();
- interceptEncoder.addToVector("1", vector);
- for (String name : fieldNames) {
- String fieldValue = value.next();
- fields.put(name, fieldValue);
-
- switch (name) {
- case "age": {
- double v = Double.parseDouble(fieldValue);
- featureEncoder.addToVector(name, Math.log(v), vector);
- break;
- }
- case "balance": {
- double v;
- v = Double.parseDouble(fieldValue);
- if (v < -2000) {
- v = -2000;
- }
- featureEncoder.addToVector(name, Math.log(v + 2001) - 8, vector);
- break;
- }
- case "duration": {
- double v;
- v = Double.parseDouble(fieldValue);
- featureEncoder.addToVector(name, Math.log(v + 1) - 5, vector);
- break;
- }
- case "pdays": {
- double v;
- v = Double.parseDouble(fieldValue);
- featureEncoder.addToVector(name, Math.log(v + 2), vector);
- break;
- }
- case "job":
- case "marital":
- case "education":
- case "default":
- case "housing":
- case "loan":
- case "contact":
- case "campaign":
- case "previous":
- case "poutcome":
- featureEncoder.addToVector(name + ":" + fieldValue, 1, vector);
- break;
- case "day":
- case "month":
- case "y":
- // ignore these for vectorizing
- break;
- default:
- throw new IllegalArgumentException(String.format("Bad field name: %s", name));
- }
- }
- }
-
- public Vector asVector() {
- return vector;
- }
-
- public int getTarget() {
- return fields.get("y").equals("no") ? 0 : 1;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
deleted file mode 100644
index 5ef6490..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.classifier.sgd.bankmarketing;
-
-import com.google.common.base.CharMatcher;
-import com.google.common.base.Splitter;
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Resources;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.Iterator;
-
-/** Parses semi-colon separated data as TelephoneCalls */
-public class TelephoneCallParser implements Iterable<TelephoneCall> {
-
- private final Splitter onSemi = Splitter.on(";").trimResults(CharMatcher.anyOf("\" ;"));
- private String resourceName;
-
- public TelephoneCallParser(String resourceName) throws IOException {
- this.resourceName = resourceName;
- }
-
- @Override
- public Iterator<TelephoneCall> iterator() {
- try {
- return new AbstractIterator<TelephoneCall>() {
- BufferedReader input =
- new BufferedReader(new InputStreamReader(Resources.getResource(resourceName).openStream()));
- Iterable<String> fieldNames = onSemi.split(input.readLine());
-
- @Override
- protected TelephoneCall computeNext() {
- try {
- String line = input.readLine();
- if (line == null) {
- return endOfData();
- }
-
- return new TelephoneCall(fieldNames, onSemi.split(line));
- } catch (IOException e) {
- throw new RuntimeException("Error reading data", e);
- }
- }
- };
- } catch (IOException e) {
- throw new RuntimeException("Error reading data", e);
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
deleted file mode 100644
index a0b845f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-final class ClustersFilter implements PathFilter {
-
- @Override
- public boolean accept(Path path) {
- String pathString = path.toString();
- return pathString.contains("/clusters-");
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
deleted file mode 100644
index 50dba99..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.BasicStroke;
-import java.awt.Color;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.canopy.CanopyDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.DenseVector;
-
-/**
- * Java desktop graphics class that runs canopy clustering and displays the results.
- * This class generates random data and clusters it.
- */
-@Deprecated
-public class DisplayCanopy extends DisplayClustering {
-
- DisplayCanopy() {
- initialize();
- this.setTitle("Canopy Clusters (>" + (int) (significance * 100) + "% of population)");
- }
-
- @Override
- public void paint(Graphics g) {
- plotSampleData((Graphics2D) g);
- plotClusters((Graphics2D) g);
- }
-
- protected static void plotClusters(Graphics2D g2) {
- int cx = CLUSTERS.size() - 1;
- for (List<Cluster> clusters : CLUSTERS) {
- for (Cluster cluster : clusters) {
- if (isSignificant(cluster)) {
- g2.setStroke(new BasicStroke(1));
- g2.setColor(Color.BLUE);
- double[] t1 = {T1, T1};
- plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
- double[] t2 = {T2, T2};
- plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
- g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
- g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
- plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
- }
- }
- cx--;
- }
- }
-
- public static void main(String[] args) throws Exception {
- Path samples = new Path("samples");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, samples);
- HadoopUtil.delete(conf, output);
- RandomUtils.useTestSeed();
- generateSamples();
- writeSampleData(samples);
- CanopyDriver.buildClusters(conf, samples, output, new ManhattanDistanceMeasure(), T1, T2, 0, true);
- loadClustersWritable(output);
-
- new DisplayCanopy();
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
deleted file mode 100644
index ad85c6a..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.*;
-import java.awt.event.WindowAdapter;
-import java.awt.event.WindowEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.Ellipse2D;
-import java.awt.geom.Rectangle2D;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.mahout.clustering.AbstractCluster;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.UncommonDistributions;
-import org.apache.mahout.clustering.classify.WeightedVectorWritable;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DisplayClustering extends Frame {
-
- private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
-
- protected static final int DS = 72; // default scale = 72 pixels per inch
-
- protected static final int SIZE = 8; // screen size in inches
-
- private static final Collection<Vector> SAMPLE_PARAMS = new ArrayList<>();
-
- protected static final List<VectorWritable> SAMPLE_DATA = new ArrayList<>();
-
- protected static final List<List<Cluster>> CLUSTERS = new ArrayList<>();
-
- static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
- Color.lightGray };
-
- protected static final double T1 = 3.0;
-
- protected static final double T2 = 2.8;
-
- static double significance = 0.05;
-
- protected static int res; // screen resolution
-
- public DisplayClustering() {
- initialize();
- this.setTitle("Sample Data");
- }
-
- public void initialize() {
- // Get screen resolution
- res = Toolkit.getDefaultToolkit().getScreenResolution();
-
- // Set Frame size in inches
- this.setSize(SIZE * res, SIZE * res);
- this.setVisible(true);
- this.setTitle("Asymmetric Sample Data");
-
- // Window listener to terminate program.
- this.addWindowListener(new WindowAdapter() {
- @Override
- public void windowClosing(WindowEvent e) {
- System.exit(0);
- }
- });
- }
-
- public static void main(String[] args) throws Exception {
- RandomUtils.useTestSeed();
- generateSamples();
- new DisplayClustering();
- }
-
- // Override the paint() method
- @Override
- public void paint(Graphics g) {
- Graphics2D g2 = (Graphics2D) g;
- plotSampleData(g2);
- plotSampleParameters(g2);
- plotClusters(g2);
- }
-
- protected static void plotClusters(Graphics2D g2) {
- int cx = CLUSTERS.size() - 1;
- for (List<Cluster> clusters : CLUSTERS) {
- g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
- g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
- for (Cluster cluster : clusters) {
- plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
- }
- }
- }
-
- protected static void plotSampleParameters(Graphics2D g2) {
- Vector v = new DenseVector(2);
- Vector dv = new DenseVector(2);
- g2.setColor(Color.RED);
- for (Vector param : SAMPLE_PARAMS) {
- v.set(0, param.get(0));
- v.set(1, param.get(1));
- dv.set(0, param.get(2) * 3);
- dv.set(1, param.get(3) * 3);
- plotEllipse(g2, v, dv);
- }
- }
-
- protected static void plotSampleData(Graphics2D g2) {
- double sx = (double) res / DS;
- g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-
- // plot the axes
- g2.setColor(Color.BLACK);
- Vector dv = new DenseVector(2).assign(SIZE / 2.0);
- plotRectangle(g2, new DenseVector(2).assign(2), dv);
- plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-
- // plot the sample data
- g2.setColor(Color.DARK_GRAY);
- dv.assign(0.03);
- for (VectorWritable v : SAMPLE_DATA) {
- plotRectangle(g2, v.get(), dv);
- }
- }
-
- /**
- * This method plots points and colors them according to their cluster
- * membership, rather than drawing ellipses.
- *
- * As of commit, this method is used only by K-means spectral clustering.
- * Since the cluster assignments are set within the eigenspace of the data, it
- * is not inherent that the original data cluster as they would in K-means:
- * that is, as symmetric gaussian mixtures.
- *
- * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
- * output is not directly usable. Rather, the cluster assignments from the raw
- * output need to be transferred back to the original data. As such, this
- * method will read the SequenceFile cluster results of K-means and transfer
- * the cluster assignments to the original data, coloring them appropriately.
- *
- * @param g2
- * @param data
- */
- protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
- double sx = (double) res / DS;
- g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
-
- g2.setColor(Color.BLACK);
- Vector dv = new DenseVector(2).assign(SIZE / 2.0);
- plotRectangle(g2, new DenseVector(2).assign(2), dv);
- plotRectangle(g2, new DenseVector(2).assign(-2), dv);
-
- // plot the sample data, colored according to the cluster they belong to
- dv.assign(0.03);
-
- Path clusteredPointsPath = new Path(data, "clusteredPoints");
- Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
- Map<Integer,Color> colors = new HashMap<>();
- int point = 0;
- for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
- inputPath, new Configuration())) {
- int clusterId = record.getFirst().get();
- VectorWritable v = SAMPLE_DATA.get(point++);
- Integer key = clusterId;
- if (!colors.containsKey(key)) {
- colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
- }
- plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
- }
- }
-
- /**
- * Identical to plotRectangle(), but with the option of setting the color of
- * the rectangle's stroke.
- *
- * NOTE: This should probably be refactored with plotRectangle() since most of
- * the code here is direct copy/paste from that method.
- *
- * @param g2
- * A Graphics2D context.
- * @param v
- * A vector for the rectangle's center.
- * @param dv
- * A vector for the rectangle's dimensions.
- * @param color
- * The color of the rectangle's stroke.
- */
- protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
- double[] flip = {1, -1};
- Vector v2 = v.times(new DenseVector(flip));
- v2 = v2.minus(dv.divide(2));
- int h = SIZE / 2;
- double x = v2.get(0) + h;
- double y = v2.get(1) + h;
-
- g2.setStroke(new BasicStroke(1));
- g2.setColor(color);
- g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
- }
-
- /**
- * Draw a rectangle on the graphics context
- *
- * @param g2
- * a Graphics2D context
- * @param v
- * a Vector of rectangle center
- * @param dv
- * a Vector of rectangle dimensions
- */
- protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
- double[] flip = {1, -1};
- Vector v2 = v.times(new DenseVector(flip));
- v2 = v2.minus(dv.divide(2));
- int h = SIZE / 2;
- double x = v2.get(0) + h;
- double y = v2.get(1) + h;
- g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
- }
-
- /**
- * Draw an ellipse on the graphics context
- *
- * @param g2
- * a Graphics2D context
- * @param v
- * a Vector of ellipse center
- * @param dv
- * a Vector of ellipse dimensions
- */
- protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
- double[] flip = {1, -1};
- Vector v2 = v.times(new DenseVector(flip));
- v2 = v2.minus(dv.divide(2));
- int h = SIZE / 2;
- double x = v2.get(0) + h;
- double y = v2.get(1) + h;
- g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
- }
-
- protected static void generateSamples() {
- generateSamples(500, 1, 1, 3);
- generateSamples(300, 1, 0, 0.5);
- generateSamples(300, 0, 2, 0.1);
- }
-
- protected static void generate2dSamples() {
- generate2dSamples(500, 1, 1, 3, 1);
- generate2dSamples(300, 1, 0, 0.5, 1);
- generate2dSamples(300, 0, 2, 0.1, 0.5);
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sd
- * double standard deviation of the samples
- */
- protected static void generateSamples(int num, double mx, double my, double sd) {
- double[] params = {mx, my, sd, sd};
- SAMPLE_PARAMS.add(new DenseVector(params));
- log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
- for (int i = 0; i < num; i++) {
- SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
- UncommonDistributions.rNorm(my, sd)})));
- }
- }
-
- protected static void writeSampleData(Path output) throws IOException {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(output.toUri(), conf);
-
- try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class)) {
- int i = 0;
- for (VectorWritable vw : SAMPLE_DATA) {
- writer.append(new Text("sample_" + i++), vw);
- }
- }
- }
-
- protected static List<Cluster> readClustersWritable(Path clustersIn) {
- List<Cluster> clusters = new ArrayList<>();
- Configuration conf = new Configuration();
- for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
- PathFilters.logsCRCFilter(), conf)) {
- Cluster cluster = value.getValue();
- log.info(
- "Reading Cluster:{} center:{} numPoints:{} radius:{}",
- cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
- cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null));
- clusters.add(cluster);
- }
- return clusters;
- }
-
- protected static void loadClustersWritable(Path output) throws IOException {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(output.toUri(), conf);
- for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
- List<Cluster> clusters = readClustersWritable(s.getPath());
- CLUSTERS.add(clusters);
- }
- }
-
- /**
- * Generate random samples and add them to the sampleData
- *
- * @param num
- * int number of samples to generate
- * @param mx
- * double x-value of the sample mean
- * @param my
- * double y-value of the sample mean
- * @param sdx
- * double x-value standard deviation of the samples
- * @param sdy
- * double y-value standard deviation of the samples
- */
- protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
- double[] params = {mx, my, sdx, sdy};
- SAMPLE_PARAMS.add(new DenseVector(params));
- log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
- for (int i = 0; i < num; i++) {
- SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sdx),
- UncommonDistributions.rNorm(my, sdy)})));
- }
- }
-
- protected static boolean isSignificant(Cluster cluster) {
- return (double) cluster.getNumObservations() / SAMPLE_DATA.size() > significance;
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
deleted file mode 100644
index f8ce7c7..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayFuzzyKMeans extends DisplayClustering {
-
- DisplayFuzzyKMeans() {
- initialize();
- this.setTitle("Fuzzy k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
- }
-
- // Override the paint() method
- @Override
- public void paint(Graphics g) {
- plotSampleData((Graphics2D) g);
- plotClusters((Graphics2D) g);
- }
-
- public static void main(String[] args) throws Exception {
- DistanceMeasure measure = new ManhattanDistanceMeasure();
-
- Path samples = new Path("samples");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, output);
- HadoopUtil.delete(conf, samples);
- RandomUtils.useTestSeed();
- DisplayClustering.generateSamples();
- writeSampleData(samples);
- boolean runClusterer = true;
- int maxIterations = 10;
- float threshold = 0.001F;
- float m = 1.1F;
- if (runClusterer) {
- runSequentialFuzzyKClusterer(conf, samples, output, measure, maxIterations, m, threshold);
- } else {
- int numClusters = 3;
- runSequentialFuzzyKClassifier(conf, samples, output, measure, numClusters, maxIterations, m, threshold);
- }
- new DisplayFuzzyKMeans();
- }
-
- private static void runSequentialFuzzyKClassifier(Configuration conf, Path samples, Path output,
- DistanceMeasure measure, int numClusters, int maxIterations, float m, double threshold) throws IOException {
- Collection<Vector> points = Lists.newArrayList();
- for (int i = 0; i < numClusters; i++) {
- points.add(SAMPLE_DATA.get(i).get());
- }
- List<Cluster> initialClusters = Lists.newArrayList();
- int id = 0;
- for (Vector point : points) {
- initialClusters.add(new SoftCluster(point, id++, measure));
- }
- ClusterClassifier prior = new ClusterClassifier(initialClusters, new FuzzyKMeansClusteringPolicy(m, threshold));
- Path priorPath = new Path(output, "classifier-0");
- prior.writeToSeqFiles(priorPath);
-
- ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
- loadClustersWritable(output);
- }
-
- private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
- DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException,
- ClassNotFoundException, InterruptedException {
- Path clustersIn = new Path(output, "random-seeds");
- RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
- FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
- true);
-
- loadClustersWritable(output);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
deleted file mode 100644
index 336d69e..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.Cluster;
-import org.apache.mahout.clustering.classify.ClusterClassifier;
-import org.apache.mahout.clustering.iterator.ClusterIterator;
-import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
-import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-import org.apache.mahout.math.Vector;
-
-import com.google.common.collect.Lists;
-
-public class DisplayKMeans extends DisplayClustering {
-
- DisplayKMeans() {
- initialize();
- this.setTitle("k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
- }
-
- public static void main(String[] args) throws Exception {
- DistanceMeasure measure = new ManhattanDistanceMeasure();
- Path samples = new Path("samples");
- Path output = new Path("output");
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, samples);
- HadoopUtil.delete(conf, output);
-
- RandomUtils.useTestSeed();
- generateSamples();
- writeSampleData(samples);
- boolean runClusterer = true;
- double convergenceDelta = 0.001;
- int numClusters = 3;
- int maxIterations = 10;
- if (runClusterer) {
- runSequentialKMeansClusterer(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
- } else {
- runSequentialKMeansClassifier(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
- }
- new DisplayKMeans();
- }
-
- private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
- DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException {
- Collection<Vector> points = Lists.newArrayList();
- for (int i = 0; i < numClusters; i++) {
- points.add(SAMPLE_DATA.get(i).get());
- }
- List<Cluster> initialClusters = Lists.newArrayList();
- int id = 0;
- for (Vector point : points) {
- initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure));
- }
- ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta));
- Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
- prior.writeToSeqFiles(priorPath);
-
- ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
- loadClustersWritable(output);
- }
-
- private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
- DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
- throws IOException, InterruptedException, ClassNotFoundException {
- Path clustersIn = new Path(output, "random-seeds");
- RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
- KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
- loadClustersWritable(output);
- }
-
- // Override the paint() method
- @Override
- public void paint(Graphics g) {
- plotSampleData((Graphics2D) g);
- plotClusters((Graphics2D) g);
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
deleted file mode 100644
index 2b70749..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.display;
-
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.io.BufferedWriter;
-import java.io.FileWriter;
-import java.io.Writer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.RandomUtils;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
-
-public class DisplaySpectralKMeans extends DisplayClustering {
-
- protected static final String SAMPLES = "samples";
- protected static final String OUTPUT = "output";
- protected static final String TEMP = "tmp";
- protected static final String AFFINITIES = "affinities";
-
- DisplaySpectralKMeans() {
- initialize();
- setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
- }
-
- public static void main(String[] args) throws Exception {
- DistanceMeasure measure = new ManhattanDistanceMeasure();
- Path samples = new Path(SAMPLES);
- Path output = new Path(OUTPUT);
- Path tempDir = new Path(TEMP);
- Configuration conf = new Configuration();
- HadoopUtil.delete(conf, samples);
- HadoopUtil.delete(conf, output);
-
- RandomUtils.useTestSeed();
- DisplayClustering.generateSamples();
- writeSampleData(samples);
- Path affinities = new Path(output, AFFINITIES);
- FileSystem fs = FileSystem.get(output.toUri(), conf);
- if (!fs.exists(output)) {
- fs.mkdirs(output);
- }
-
- try (Writer writer = new BufferedWriter(new FileWriter(affinities.toString()))){
- for (int i = 0; i < SAMPLE_DATA.size(); i++) {
- for (int j = 0; j < SAMPLE_DATA.size(); j++) {
- writer.write(i + "," + j + ',' + measure.distance(SAMPLE_DATA.get(i).get(),
- SAMPLE_DATA.get(j).get()) + '\n');
- }
- }
- }
-
- int maxIter = 10;
- double convergenceDelta = 0.001;
- SpectralKMeansDriver.run(new Configuration(), affinities, output, SAMPLE_DATA.size(), 3, measure,
- convergenceDelta, maxIter, tempDir);
- new DisplaySpectralKMeans();
- }
-
- @Override
- public void paint(Graphics g) {
- plotClusteredSampleData((Graphics2D) g, new Path(new Path(OUTPUT), "kmeans_out"));
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
deleted file mode 100644
index 470c16c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/display/README.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-The following classes can be run without parameters to generate a sample data set and
-run the reference clustering implementations over them:
-
-DisplayClustering - generates 1000 samples from three, symmetric distributions. This is the same
- data set that is used by the following clustering programs. It displays the points on a screen
- and superimposes the model parameters that were used to generate the points. You can edit the
- generateSamples() method to change the sample points used by these programs.
-
- * DisplayCanopy - uses Canopy clustering
- * DisplayKMeans - uses k-Means clustering
- * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
-
- * NOTE: some of these programs display the sample points and then superimpose all of the clusters
- from each iteration. The last iteration's clusters are in bold red and the previous several are
- colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
- light grey. This helps to visualize how the clusters converge upon a solution over multiple
- iterations.
- * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE
- you can obtain different results.
-
-
-
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
deleted file mode 100644
index c29cbc4..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.clustering.streaming.tools;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.List;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.io.Closeables;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.util.HelpFormatter;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.mahout.clustering.iterator.ClusterWritable;
-import org.apache.mahout.clustering.ClusteringUtils;
-import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.distance.DistanceMeasure;
-import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
-import org.apache.mahout.math.Centroid;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.stats.OnlineSummarizer;
-
-public class ClusterQualitySummarizer extends AbstractJob {
- private String outputFile;
-
- private PrintWriter fileOut;
-
- private String trainFile;
- private String testFile;
- private String centroidFile;
- private String centroidCompareFile;
- private boolean mahoutKMeansFormat;
- private boolean mahoutKMeansFormatCompare;
-
- private DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
-
- public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
- printSummaries(summarizers, type, fileOut);
- }
-
- public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
- double maxDistance = 0;
- for (int i = 0; i < summarizers.size(); ++i) {
- OnlineSummarizer summarizer = summarizers.get(i);
- if (summarizer.getCount() > 1) {
- maxDistance = Math.max(maxDistance, summarizer.getMax());
- System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
- // If there is just one point in the cluster, quartiles cannot be estimated. We'll just assume all the quartiles
- // equal the only value.
- if (fileOut != null) {
- fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
- summarizer.getSD(),
- summarizer.getQuartile(0),
- summarizer.getQuartile(1),
- summarizer.getQuartile(2),
- summarizer.getQuartile(3),
- summarizer.getQuartile(4), summarizer.getCount(), type);
- }
- } else {
-      System.out.printf("Cluster %d has %d data point. Need at least 2 data points in a cluster for" +
-          " OnlineSummarizer.\n", i, summarizer.getCount());
- }
- }
- System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
- }
-
- public int run(String[] args) throws IOException {
- if (!parseArgs(args)) {
- return -1;
- }
-
- Configuration conf = new Configuration();
- try {
- fileOut = new PrintWriter(new FileOutputStream(outputFile));
- fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
- + "distance.q4,count,is.train\n");
-
- // Reading in the centroids (both pairs, if they exist).
- List<Centroid> centroids;
- List<Centroid> centroidsCompare = null;
- if (mahoutKMeansFormat) {
- SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
- new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
- centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
- } else {
- SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
- new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
- centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
- }
-
- if (centroidCompareFile != null) {
- if (mahoutKMeansFormatCompare) {
- SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
- new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
- centroidsCompare = Lists.newArrayList(
- IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
- } else {
- SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
- new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
- centroidsCompare = Lists.newArrayList(
- IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
- }
- }
-
- // Reading in the "training" set.
- SequenceFileDirValueIterable<VectorWritable> trainIterable =
- new SequenceFileDirValueIterable<>(new Path(trainFile), PathType.GLOB, conf);
- Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
- Iterable<Vector> datapoints = trainDatapoints;
-
- printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
- new SquaredEuclideanDistanceMeasure()), "train");
-
- // Also adding in the "test" set.
- if (testFile != null) {
- SequenceFileDirValueIterable<VectorWritable> testIterable =
- new SequenceFileDirValueIterable<>(new Path(testFile), PathType.GLOB, conf);
- Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
-
- printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
- new SquaredEuclideanDistanceMeasure()), "test");
-
- datapoints = Iterables.concat(trainDatapoints, testDatapoints);
- }
-
- // At this point, all train/test CSVs have been written. We now compute quality metrics.
- List<OnlineSummarizer> summaries =
- ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
- List<OnlineSummarizer> compareSummaries = null;
- if (centroidsCompare != null) {
- compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
- }
- System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
- if (compareSummaries != null) {
- System.out.printf(" Second: %f\n", ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
- } else {
- System.out.printf("\n");
- }
- System.out.printf("[Davies-Bouldin Index] First: %f",
- ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
- if (compareSummaries != null) {
- System.out.printf(" Second: %f\n",
- ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
- } else {
- System.out.printf("\n");
- }
- } catch (IOException e) {
- System.out.println(e.getMessage());
- } finally {
- Closeables.close(fileOut, false);
- }
- return 0;
- }
-
- private boolean parseArgs(String[] args) {
- DefaultOptionBuilder builder = new DefaultOptionBuilder();
-
- Option help = builder.withLongName("help").withDescription("print this list").create();
-
- ArgumentBuilder argumentBuilder = new ArgumentBuilder();
- Option inputFileOption = builder.withLongName("input")
- .withShortName("i")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
- .withDescription("where to get seq files with the vectors (training set)")
- .create();
-
- Option testInputFileOption = builder.withLongName("testInput")
- .withShortName("itest")
- .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
- .withDescription("where to get seq files with the vectors (test set)")
- .create();
-
- Option centroidsFileOption = builder.withLongName("centroids")
- .withShortName("c")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
- .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
- .create();
-
- Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
- .withShortName("cc")
- .withRequired(false)
- .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
- .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or "
- + "StreamingKMeansDriver)")
- .create();
-
- Option outputFileOption = builder.withLongName("output")
- .withShortName("o")
- .withRequired(true)
- .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
- .withDescription("where to dump the CSV file with the results")
- .create();
-
- Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
- .withShortName("mkm")
- .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
- .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
- .create();
-
- Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
- .withShortName("mkmc")
- .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
- .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
- .create();
-
- Group normalArgs = new GroupBuilder()
- .withOption(help)
- .withOption(inputFileOption)
- .withOption(testInputFileOption)
- .withOption(outputFileOption)
- .withOption(centroidsFileOption)
- .withOption(centroidsCompareFileOption)
- .withOption(mahoutKMeansFormatOption)
- .withOption(mahoutKMeansCompareFormatOption)
- .create();
-
- Parser parser = new Parser();
- parser.setHelpOption(help);
- parser.setHelpTrigger("--help");
- parser.setGroup(normalArgs);
- parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
-
- CommandLine cmdLine = parser.parseAndHelp(args);
- if (cmdLine == null) {
- return false;
- }
-
- trainFile = (String) cmdLine.getValue(inputFileOption);
- if (cmdLine.hasOption(testInputFileOption)) {
- testFile = (String) cmdLine.getValue(testInputFileOption);
- }
- centroidFile = (String) cmdLine.getValue(centroidsFileOption);
- if (cmdLine.hasOption(centroidsCompareFileOption)) {
- centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
- }
- outputFile = (String) cmdLine.getValue(outputFileOption);
- if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
- mahoutKMeansFormat = true;
- }
- if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
- mahoutKMeansFormatCompare = true;
- }
- return true;
- }
-
- public static void main(String[] args) throws IOException {
- new ClusterQualitySummarizer().run(args);
- }
-}
r***@apache.org
2018-06-28 14:55:15 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
deleted file mode 100644
index 752bb48..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
+++ /dev/null
@@ -1,274 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import com.google.common.io.Closeables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.filecache.DistributedCache;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.mahout.common.AbstractJob;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.common.iterator.sequencefile.PathFilters;
-import org.apache.mahout.common.iterator.sequencefile.PathType;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
-import org.apache.mahout.math.VarIntWritable;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * Convert the Mail archives (see {@link org.apache.mahout.text.SequenceFilesFromMailArchives}) to a preference
- * file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.item.RecommenderJob}.
- * <p/>
- * This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list
- * (separated by the user's choosing) containing the from email and any references
- * <p/>
- * The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the
- * message ids that the user has interacted with (as a VectorWritable). This class currently does not account for
- * thread hijacking.
- * <p/>
- * It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
- */
-public final class MailToPrefsDriver extends AbstractJob {
-
- private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
-
- private static final String OUTPUT_FILES_PATTERN = "part-*";
- private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-
- public static void main(String[] args) throws Exception {
- ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
- }
-
- @Override
- public int run(String[] args) throws Exception {
- addInputOption();
- addOutputOption();
- addOption(DefaultOptionCreator.overwriteOption().create());
- addOption("chunkSize", "cs", "The size of chunks to write. Default is 100 mb", "100");
- addOption("separator", "sep", "The separator used in the input file to separate to, from, subject. Default is \\n",
- "\n");
- addOption("from", "f", "The position in the input text (value) where the from email is located, starting from "
- + "zero (0).", "0");
- addOption("refs", "r", "The position in the input text (value) where the reference ids are located, "
- + "starting from zero (0).", "1");
- addOption(buildOption("useCounts", "u", "If set, then use the number of times the user has interacted with a "
- + "thread as an indication of their preference. Otherwise, use boolean preferences.", false, false,
- String.valueOf(true)));
- Map<String, List<String>> parsedArgs = parseArguments(args);
-
- Path input = getInputPath();
- Path output = getOutputPath();
- int chunkSize = Integer.parseInt(getOption("chunkSize"));
- String separator = getOption("separator");
- Configuration conf = getConf();
- boolean useCounts = hasOption("useCounts");
- AtomicInteger currentPhase = new AtomicInteger();
- int[] msgDim = new int[1];
- //TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
- List<Path> msgIdChunks = null;
- boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
- // create the dictionary between message ids and longs
- if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- //TODO: there seems to be a pattern emerging for dictionary creation
- // -- sparse vectors from seq files also has this.
- Path msgIdsPath = new Path(output, "msgIds");
- if (overwrite) {
- HadoopUtil.delete(conf, msgIdsPath);
- }
- log.info("Creating Msg Id Dictionary");
- Job createMsgIdDictionary = prepareJob(input,
- msgIdsPath,
- SequenceFileInputFormat.class,
- MsgIdToDictionaryMapper.class,
- Text.class,
- VarIntWritable.class,
- MailToDictionaryReducer.class,
- Text.class,
- VarIntWritable.class,
- SequenceFileOutputFormat.class);
-
- boolean succeeded = createMsgIdDictionary.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //write out the dictionary at the top level
- msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-",
- createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
- }
- //create the dictionary between from email addresses and longs
- List<Path> fromChunks = null;
- if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- Path fromIdsPath = new Path(output, "fromIds");
- if (overwrite) {
- HadoopUtil.delete(conf, fromIdsPath);
- }
- log.info("Creating From Id Dictionary");
- Job createFromIdDictionary = prepareJob(input,
- fromIdsPath,
- SequenceFileInputFormat.class,
- FromEmailToDictionaryMapper.class,
- Text.class,
- VarIntWritable.class,
- MailToDictionaryReducer.class,
- Text.class,
- VarIntWritable.class,
- SequenceFileOutputFormat.class);
- createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
- boolean succeeded = createFromIdDictionary.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //write out the dictionary at the top level
- int[] fromDim = new int[1];
- fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-",
- createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
- }
- //OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
- if (shouldRunNextPhase(parsedArgs, currentPhase) && fromChunks != null && msgIdChunks != null) {
- //Job map
- //may be a way to do this so that we can load the from ids in memory, if they are small enough so that
- // we don't need the double loop
- log.info("Creating recommendation matrix");
- Path vecPath = new Path(output, "recInput");
- if (overwrite) {
- HadoopUtil.delete(conf, vecPath);
- }
- //conf.set(EmailUtility.FROM_DIMENSION, String.valueOf(fromDim[0]));
- conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
- conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
- conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
- conf.set(EmailUtility.FROM_INDEX, getOption("from"));
- conf.set(EmailUtility.REFS_INDEX, getOption("refs"));
- conf.set(EmailUtility.SEPARATOR, separator);
- conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, String.valueOf(useCounts));
- int j = 0;
- int i = 0;
- for (Path fromChunk : fromChunks) {
- for (Path idChunk : msgIdChunks) {
- Path out = new Path(vecPath, "tmp-" + i + '-' + j);
- DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
- Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
- MailToRecMapper.class, Text.class, LongWritable.class, MailToRecReducer.class, Text.class,
- NullWritable.class, TextOutputFormat.class);
- createRecMatrix.getConfiguration().set("mapred.output.compress", "false");
- boolean succeeded = createRecMatrix.waitForCompletion(true);
- if (!succeeded) {
- return -1;
- }
- //copy the results up a level
- //HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true,
- // conf, "");
- FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null,
- conf);
- for (int k = 0; k < fs.length; k++) {
- FileStatus f = fs[k];
- Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
- FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true,
- overwrite, conf);
- }
- HadoopUtil.delete(conf, out);
- j++;
- }
- i++;
- }
- //concat the files together
- /*Path mergePath = new Path(output, "vectors.dat");
- if (overwrite) {
- HadoopUtil.delete(conf, mergePath);
- }
- log.info("Merging together output vectors to vectors.dat in {}", output);*/
- //HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath,
- // false, conf, "\n");
- }
-
- return 0;
- }
-
- private static List<Path> createDictionaryChunks(Path inputPath,
- Path dictionaryPathBase,
- String name,
- Configuration baseConf,
- int chunkSizeInMegabytes, int[] maxTermDimension)
- throws IOException {
- List<Path> chunkPaths = new ArrayList<>();
-
- Configuration conf = new Configuration(baseConf);
-
- FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
-
- long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
- int chunkIndex = 0;
- Path chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
- chunkPaths.add(chunkPath);
-
- SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
-
- try {
- long currentChunkSize = 0;
- Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
- int i = 1; //start at 1, since a miss in the OpenObjectIntHashMap returns a 0
- for (Pair<Writable, Writable> record
- : new SequenceFileDirIterable<>(filesPattern, PathType.GLOB, null, null, true, conf)) {
- if (currentChunkSize > chunkSizeLimit) {
- Closeables.close(dictWriter, false);
- chunkIndex++;
-
- chunkPath = new Path(dictionaryPathBase, name + chunkIndex);
- chunkPaths.add(chunkPath);
-
- dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
- currentChunkSize = 0;
- }
-
- Writable key = record.getFirst();
- int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
- currentChunkSize += fieldSize;
- dictWriter.append(key, new IntWritable(i++));
- }
- maxTermDimension[0] = i;
- } finally {
- Closeables.close(dictWriter, false);
- }
-
- return chunkPaths;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
deleted file mode 100644
index 91bbd17..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-
-public final class MailToRecMapper extends Mapper<Text, Text, Text, LongWritable> {
-
- private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
-
- private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<>();
- private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<>();
- private String separator = "\n";
- private int fromIdx;
- private int refsIdx;
-
- public enum Counters {
- REFERENCE, ORIGINAL
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- Configuration conf = context.getConfiguration();
- String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
- String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
- fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
- refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
- EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix, msgIdDictionary);
- log.info("From Dictionary size: {} Msg Id Dictionary size: {}", fromDictionary.size(), msgIdDictionary.size());
- separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
- }
-
- @Override
- protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
-
- int msgIdKey = Integer.MIN_VALUE;
-
-
- int fromKey = Integer.MIN_VALUE;
- String valStr = value.toString();
- String[] splits = StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
-
- if (splits != null && splits.length > 0) {
- if (splits.length > refsIdx) {
- String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
- fromKey = fromDictionary.get(from);
- }
- //get the references
- if (splits.length > refsIdx) {
- String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
- if (theRefs != null && theRefs.length > 0) {
- //we have a reference, the first one is the original message id, so map to that one if it exists
- msgIdKey = msgIdDictionary.get(theRefs[0]);
- context.getCounter(Counters.REFERENCE).increment(1);
- }
- }
- }
- //we don't have any references, so use the msg id
- if (msgIdKey == Integer.MIN_VALUE) {
- //get the msg id and the from and output the associated ids
- String keyStr = key.toString();
- int idx = keyStr.lastIndexOf('/');
- if (idx != -1) {
- String msgId = keyStr.substring(idx + 1);
- msgIdKey = msgIdDictionary.get(msgId);
- context.getCounter(Counters.ORIGINAL).increment(1);
- }
- }
-
- if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
- context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
- }
- }
-
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
deleted file mode 100644
index ee36a41..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-
-import java.io.IOException;
-
-public class MailToRecReducer extends Reducer<Text, LongWritable, Text, NullWritable> {
- //if true, then output weight
- private boolean useCounts = true;
- /**
- * We can either ignore how many times the user interacted (boolean) or output the number of times they interacted.
- */
- public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, true);
- }
-
- @Override
- protected void reduce(Text key, Iterable<LongWritable> values, Context context)
- throws IOException, InterruptedException {
- if (useCounts) {
- long sum = 0;
- for (LongWritable value : values) {
- sum++;
- }
- context.write(new Text(key.toString() + ',' + sum), null);
- } else {
- context.write(new Text(key.toString()), null);
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
deleted file mode 100644
index f3de847..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- */
-public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
-
- @Override
- protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
- //message id is in the key: /201008/AANLkTikvVnhNH+Y5AGEwqd2=***@mail.gmail.com
- String keyStr = key.toString();
- int idx = keyStr.lastIndexOf('@'); //find the last @
- if (idx == -1) {
- context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
- } else {
- //found the @, now find the last slash before the @ and grab everything after that
- idx = keyStr.lastIndexOf('/', idx);
- String msgId = keyStr.substring(idx + 1);
- if (EmailUtility.WHITESPACE.matcher(msgId).matches()) {
- context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
- } else {
- context.write(new Text(msgId), new VarIntWritable(1));
- }
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
deleted file mode 100644
index c358021..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterable.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-public final class DataFileIterable implements Iterable<Pair<PreferenceArray,long[]>> {
-
- private final File dataFile;
-
- public DataFileIterable(File dataFile) {
- this.dataFile = dataFile;
- }
-
- @Override
- public Iterator<Pair<PreferenceArray, long[]>> iterator() {
- try {
- return new DataFileIterator(dataFile);
- } catch (IOException ioe) {
- throw new IllegalStateException(ioe);
- }
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
deleted file mode 100644
index 786e080..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/DataFileIterator.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.AbstractIterator;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.impl.common.SkippingIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.iterator.FileLineIterator;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>An {@link java.util.Iterator} which iterates over any of the KDD Cup's rating files. These include the files
- * {train,test,validation}Idx{1,2}}.txt. See http://kddcup.yahoo.com/. Each element in the iteration corresponds
- * to one user's ratings as a {@link PreferenceArray} and corresponding timestamps as a parallel {@code long}
- * array.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- */
-public final class DataFileIterator
- extends AbstractIterator<Pair<PreferenceArray,long[]>>
- implements SkippingIterator<Pair<PreferenceArray,long[]>>, Closeable {
-
- private static final Pattern COLON_PATTERN = Pattern.compile(":");
- private static final Pattern PIPE_PATTERN = Pattern.compile("\\|");
- private static final Pattern TAB_PATTERN = Pattern.compile("\t");
-
- private final FileLineIterator lineIterator;
-
- private static final Logger log = LoggerFactory.getLogger(DataFileIterator.class);
-
- public DataFileIterator(File dataFile) throws IOException {
- if (dataFile == null || dataFile.isDirectory() || !dataFile.exists()) {
- throw new IllegalArgumentException("Bad data file: " + dataFile);
- }
- lineIterator = new FileLineIterator(dataFile);
- }
-
- @Override
- protected Pair<PreferenceArray, long[]> computeNext() {
-
- if (!lineIterator.hasNext()) {
- return endOfData();
- }
-
- String line = lineIterator.next();
- // First a userID|ratingsCount line
- String[] tokens = PIPE_PATTERN.split(line);
-
- long userID = Long.parseLong(tokens[0]);
- int ratingsLeftToRead = Integer.parseInt(tokens[1]);
- int ratingsRead = 0;
-
- PreferenceArray currentUserPrefs = new GenericUserPreferenceArray(ratingsLeftToRead);
- long[] timestamps = new long[ratingsLeftToRead];
-
- while (ratingsLeftToRead > 0) {
-
- line = lineIterator.next();
-
- // Then a data line. May be 1-4 tokens depending on whether preference info is included (it's not in test data)
- // or whether date info is included (not inluded in track 2). Item ID is always first, and date is the last
- // two fields if it exists.
- tokens = TAB_PATTERN.split(line);
- boolean hasPref = tokens.length == 2 || tokens.length == 4;
- boolean hasDate = tokens.length > 2;
-
- long itemID = Long.parseLong(tokens[0]);
-
- currentUserPrefs.setUserID(0, userID);
- currentUserPrefs.setItemID(ratingsRead, itemID);
- if (hasPref) {
- float preference = Float.parseFloat(tokens[1]);
- currentUserPrefs.setValue(ratingsRead, preference);
- }
-
- if (hasDate) {
- long timestamp;
- if (hasPref) {
- timestamp = parseFakeTimestamp(tokens[2], tokens[3]);
- } else {
- timestamp = parseFakeTimestamp(tokens[1], tokens[2]);
- }
- timestamps[ratingsRead] = timestamp;
- }
-
- ratingsRead++;
- ratingsLeftToRead--;
- }
-
- return new Pair<>(currentUserPrefs, timestamps);
- }
-
- @Override
- public void skip(int n) {
- for (int i = 0; i < n; i++) {
- if (lineIterator.hasNext()) {
- String line = lineIterator.next();
- // First a userID|ratingsCount line
- String[] tokens = PIPE_PATTERN.split(line);
- int linesToSKip = Integer.parseInt(tokens[1]);
- lineIterator.skip(linesToSKip);
- } else {
- break;
- }
- }
- }
-
- @Override
- public void close() {
- endOfData();
- try {
- Closeables.close(lineIterator, true);
- } catch (IOException e) {
- log.error(e.getMessage(), e);
- }
- }
-
- /**
- * @param dateString "date" in days since some undisclosed date, which we will arbitrarily assume to be the
- * epoch, January 1 1970.
- * @param timeString time of day in HH:mm:ss format
- * @return the UNIX timestamp for this moment in time
- */
- private static long parseFakeTimestamp(String dateString, CharSequence timeString) {
- int days = Integer.parseInt(dateString);
- String[] timeTokens = COLON_PATTERN.split(timeString);
- int hours = Integer.parseInt(timeTokens[0]);
- int minutes = Integer.parseInt(timeTokens[1]);
- int seconds = Integer.parseInt(timeTokens[2]);
- return 86400L * days + 3600L + hours + 60L * minutes + seconds;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
deleted file mode 100644
index 4b62050..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/KDDCupDataModel.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Iterator;
-
-import com.google.common.base.Preconditions;
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.SamplingIterator;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * <p>An {@link DataModel} which reads into memory any of the KDD Cup's rating files; it is really
- * meant for use with training data in the files trainIdx{1,2}}.txt.
- * See http://kddcup.yahoo.com/.</p>
- *
- * <p>Timestamps in the data set are relative to some unknown point in time, for anonymity. They are assumed
- * to be relative to the epoch, time 0, or January 1 1970, for purposes here.</p>
- */
-public final class KDDCupDataModel implements DataModel {
-
- private static final Logger log = LoggerFactory.getLogger(KDDCupDataModel.class);
-
- private final File dataFileDirectory;
- private final DataModel delegate;
-
- /**
- * @param dataFile training rating file
- */
- public KDDCupDataModel(File dataFile) throws IOException {
- this(dataFile, false, 1.0);
- }
-
- /**
- * @param dataFile training rating file
- * @param storeDates if true, dates are parsed and stored, otherwise not
- * @param samplingRate percentage of users to keep; can be used to reduce memory requirements
- */
- public KDDCupDataModel(File dataFile, boolean storeDates, double samplingRate) throws IOException {
-
- Preconditions.checkArgument(!Double.isNaN(samplingRate) && samplingRate > 0.0 && samplingRate <= 1.0,
- "Must be: 0.0 < samplingRate <= 1.0");
-
- dataFileDirectory = dataFile.getParentFile();
-
- Iterator<Pair<PreferenceArray,long[]>> dataIterator = new DataFileIterator(dataFile);
- if (samplingRate < 1.0) {
- dataIterator = new SamplingIterator<>(dataIterator, samplingRate);
- }
-
- FastByIDMap<PreferenceArray> userData = new FastByIDMap<>();
- FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>();
-
- while (dataIterator.hasNext()) {
-
- Pair<PreferenceArray,long[]> pair = dataIterator.next();
- PreferenceArray userPrefs = pair.getFirst();
- long[] timestampsForPrefs = pair.getSecond();
-
- userData.put(userPrefs.getUserID(0), userPrefs);
- if (storeDates) {
- FastByIDMap<Long> itemTimestamps = new FastByIDMap<>();
- for (int i = 0; i < timestampsForPrefs.length; i++) {
- long timestamp = timestampsForPrefs[i];
- if (timestamp > 0L) {
- itemTimestamps.put(userPrefs.getItemID(i), timestamp);
- }
- }
- }
-
- }
-
- if (storeDates) {
- delegate = new GenericDataModel(userData, timestamps);
- } else {
- delegate = new GenericDataModel(userData);
- }
-
- Runtime runtime = Runtime.getRuntime();
- log.info("Loaded data model in about {}MB heap", (runtime.totalMemory() - runtime.freeMemory()) / 1000000);
- }
-
- public File getDataFileDirectory() {
- return dataFileDirectory;
- }
-
- public static File getTrainingFile(File dataFileDirectory) {
- return getFile(dataFileDirectory, "trainIdx");
- }
-
- public static File getValidationFile(File dataFileDirectory) {
- return getFile(dataFileDirectory, "validationIdx");
- }
-
- public static File getTestFile(File dataFileDirectory) {
- return getFile(dataFileDirectory, "testIdx");
- }
-
- public static File getTrackFile(File dataFileDirectory) {
- return getFile(dataFileDirectory, "trackData");
- }
-
- private static File getFile(File dataFileDirectory, String prefix) {
- // Works on set 1 or 2
- for (int set : new int[] {1,2}) {
- // Works on sample data from before contest or real data
- for (String firstLinesOrNot : new String[] {"", ".firstLines"}) {
- for (String gzippedOrNot : new String[] {".gz", ""}) {
- File dataFile = new File(dataFileDirectory, prefix + set + firstLinesOrNot + ".txt" + gzippedOrNot);
- if (dataFile.exists()) {
- return dataFile;
- }
- }
- }
- }
- throw new IllegalArgumentException("Can't find " + prefix + " file in " + dataFileDirectory);
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() throws TasteException {
- return delegate.getUserIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesFromUser(long userID) throws TasteException {
- return delegate.getPreferencesFromUser(userID);
- }
-
- @Override
- public FastIDSet getItemIDsFromUser(long userID) throws TasteException {
- return delegate.getItemIDsFromUser(userID);
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() throws TasteException {
- return delegate.getItemIDs();
- }
-
- @Override
- public PreferenceArray getPreferencesForItem(long itemID) throws TasteException {
- return delegate.getPreferencesForItem(itemID);
- }
-
- @Override
- public Float getPreferenceValue(long userID, long itemID) throws TasteException {
- return delegate.getPreferenceValue(userID, itemID);
- }
-
- @Override
- public Long getPreferenceTime(long userID, long itemID) throws TasteException {
- return delegate.getPreferenceTime(userID, itemID);
- }
-
- @Override
- public int getNumItems() throws TasteException {
- return delegate.getNumItems();
- }
-
- @Override
- public int getNumUsers() throws TasteException {
- return delegate.getNumUsers();
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID) throws TasteException {
- return delegate.getNumUsersWithPreferenceFor(itemID);
- }
-
- @Override
- public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException {
- return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- delegate.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- delegate.removePreference(userID, itemID);
- }
-
- @Override
- public boolean hasPreferenceValues() {
- return delegate.hasPreferenceValues();
- }
-
- @Override
- public float getMaxPreference() {
- return 100.0f;
- }
-
- @Override
- public float getMinPreference() {
- return 0.0f;
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- // do nothing
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
deleted file mode 100644
index 3f4a732..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/ToCSV.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup;
-
-import org.apache.commons.io.Charsets;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.zip.GZIPOutputStream;
-
-/**
- * <p>This class converts a KDD Cup input file into a compressed CSV format. The output format is
- * {@code userID,itemID,score,timestamp}. It can optionally restrict its output to exclude
- * score and/or timestamp.</p>
- *
- * <p>Run as: {@code ToCSV (input file) (output file) [num columns to output]}</p>
- */
-public final class ToCSV {
-
- private ToCSV() {
- }
-
- public static void main(String[] args) throws Exception {
-
- File inputFile = new File(args[0]);
- File outputFile = new File(args[1]);
- int columnsToOutput = 4;
- if (args.length >= 3) {
- columnsToOutput = Integer.parseInt(args[2]);
- }
-
- OutputStream outStream = new GZIPOutputStream(new FileOutputStream(outputFile));
-
- try (Writer outWriter = new BufferedWriter(new OutputStreamWriter(outStream, Charsets.UTF_8))){
- for (Pair<PreferenceArray,long[]> user : new DataFileIterable(inputFile)) {
- PreferenceArray prefs = user.getFirst();
- long[] timestamps = user.getSecond();
- for (int i = 0; i < prefs.length(); i++) {
- outWriter.write(String.valueOf(prefs.getUserID(i)));
- outWriter.write(',');
- outWriter.write(String.valueOf(prefs.getItemID(i)));
- if (columnsToOutput > 2) {
- outWriter.write(',');
- outWriter.write(String.valueOf(prefs.getValue(i)));
- }
- if (columnsToOutput > 3) {
- outWriter.write(',');
- outWriter.write(String.valueOf(timestamps[i]));
- }
- outWriter.write('\n');
- }
- }
- }
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
deleted file mode 100644
index 0112ab9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/EstimateConverter.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class EstimateConverter {
-
- private static final Logger log = LoggerFactory.getLogger(EstimateConverter.class);
-
- private EstimateConverter() {}
-
- public static byte convert(double estimate, long userID, long itemID) {
- if (Double.isNaN(estimate)) {
- log.warn("Unable to compute estimate for user {}, item {}", userID, itemID);
- return 0x7F;
- } else {
- int scaledEstimate = (int) (estimate * 2.55);
- if (scaledEstimate > 255) {
- scaledEstimate = 255;
- } else if (scaledEstimate < 0) {
- scaledEstimate = 0;
- }
- return (byte) scaledEstimate;
- }
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
deleted file mode 100644
index 72056da..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Callable.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.mahout.cf.taste.common.NoSuchItemException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-final class Track1Callable implements Callable<byte[]> {
-
- private static final Logger log = LoggerFactory.getLogger(Track1Callable.class);
- private static final AtomicInteger COUNT = new AtomicInteger();
-
- private final Recommender recommender;
- private final PreferenceArray userTest;
-
- Track1Callable(Recommender recommender, PreferenceArray userTest) {
- this.recommender = recommender;
- this.userTest = userTest;
- }
-
- @Override
- public byte[] call() throws TasteException {
- long userID = userTest.get(0).getUserID();
- byte[] result = new byte[userTest.length()];
- for (int i = 0; i < userTest.length(); i++) {
- long itemID = userTest.getItemID(i);
- double estimate;
- try {
- estimate = recommender.estimatePreference(userID, itemID);
- } catch (NoSuchItemException nsie) {
- // OK in the sample data provided before the contest, should never happen otherwise
- log.warn("Unknown item {}; OK unless this is the real contest data", itemID);
- continue;
- }
- result[i] = EstimateConverter.convert(estimate, userID, itemID);
- }
-
- if (COUNT.incrementAndGet() % 10000 == 0) {
- log.info("Completed {} users", COUNT.get());
- }
-
- return result;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
deleted file mode 100644
index 067daf5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Recommender.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
-
-public final class Track1Recommender implements Recommender {
-
- private final Recommender recommender;
-
- public Track1Recommender(DataModel dataModel) throws TasteException {
- // Change this to whatever you like!
- ItemSimilarity similarity = new UncenteredCosineSimilarity(dataModel);
- recommender = new GenericItemBasedRecommender(dataModel, similarity);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
- return recommender.recommend(userID, howMany);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
- return recommend(userID, howMany, null, includeKnownItems);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, false);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
- throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
- }
-
- @Override
- public float estimatePreference(long userID, long itemID) throws TasteException {
- return recommender.estimatePreference(userID, itemID);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- recommender.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- recommender.removePreference(userID, itemID);
- }
-
- @Override
- public DataModel getDataModel() {
- return recommender.getDataModel();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- recommender.refresh(alreadyRefreshed);
- }
-
- @Override
- public String toString() {
- return "Track1Recommender[recommender:" + recommender + ']';
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
deleted file mode 100644
index 6b9fe1b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class Track1RecommenderBuilder implements RecommenderBuilder {
-
- @Override
- public Recommender buildRecommender(DataModel dataModel) throws TasteException {
- return new Track1Recommender(dataModel);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
deleted file mode 100644
index bcd0a3d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluator.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.util.Collection;
-import java.util.concurrent.Callable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import com.google.common.collect.Lists;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.DataModelBuilder;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
-import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
-import org.apache.mahout.cf.taste.impl.eval.AbstractDifferenceRecommenderEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Attempts to run an evaluation just like that dictated for Yahoo's KDD Cup, Track 1.
- * It will compute the RMSE of a validation data set against the predicted ratings from
- * the training data set.
- */
-public final class Track1RecommenderEvaluator extends AbstractDifferenceRecommenderEvaluator {
-
- private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluator.class);
-
- private RunningAverage average;
- private final File dataFileDirectory;
-
- public Track1RecommenderEvaluator(File dataFileDirectory) {
- setMaxPreference(100.0f);
- setMinPreference(0.0f);
- average = new FullRunningAverage();
- this.dataFileDirectory = dataFileDirectory;
- }
-
- @Override
- public double evaluate(RecommenderBuilder recommenderBuilder,
- DataModelBuilder dataModelBuilder,
- DataModel dataModel,
- double trainingPercentage,
- double evaluationPercentage) throws TasteException {
-
- Recommender recommender = recommenderBuilder.buildRecommender(dataModel);
-
- Collection<Callable<Void>> estimateCallables = Lists.newArrayList();
- AtomicInteger noEstimateCounter = new AtomicInteger();
- for (Pair<PreferenceArray,long[]> userData
- : new DataFileIterable(KDDCupDataModel.getValidationFile(dataFileDirectory))) {
- PreferenceArray validationPrefs = userData.getFirst();
- long userID = validationPrefs.get(0).getUserID();
- estimateCallables.add(
- new PreferenceEstimateCallable(recommender, userID, validationPrefs, noEstimateCounter));
- }
-
- RunningAverageAndStdDev timing = new FullRunningAverageAndStdDev();
- execute(estimateCallables, noEstimateCounter, timing);
-
- double result = computeFinalEvaluation();
- log.info("Evaluation result: {}", result);
- return result;
- }
-
- // Use RMSE scoring:
-
- @Override
- protected void reset() {
- average = new FullRunningAverage();
- }
-
- @Override
- protected void processOneEstimate(float estimatedPreference, Preference realPref) {
- double diff = realPref.getValue() - estimatedPreference;
- average.addDatum(diff * diff);
- }
-
- @Override
- protected double computeFinalEvaluation() {
- return Math.sqrt(average.getAverage());
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
deleted file mode 100644
index deadc00..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1RecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class Track1RecommenderEvaluatorRunner {
-
- private static final Logger log = LoggerFactory.getLogger(Track1RecommenderEvaluatorRunner.class);
-
- private Track1RecommenderEvaluatorRunner() {
- }
-
- public static void main(String... args) throws IOException, TasteException, OptionException {
- File dataFileDirectory = TasteOptionParser.getRatings(args);
- if (dataFileDirectory == null) {
- throw new IllegalArgumentException("No data directory");
- }
- if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
- throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
- }
- Track1RecommenderEvaluator evaluator = new Track1RecommenderEvaluator(dataFileDirectory);
- DataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
- double evaluation = evaluator.evaluate(new Track1RecommenderBuilder(),
- null,
- model,
- Float.NaN,
- Float.NaN);
- log.info(String.valueOf(evaluation));
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
deleted file mode 100644
index a0ff126..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/Track1Runner.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1;
-
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.example.kddcup.KDDCupDataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-/**
- * <p>Runs "track 1" of the KDD Cup competition using whatever recommender is inside {@link Track1Recommender}
- * and attempts to output the result in the correct contest format.</p>
- *
- * <p>Run as: {@code Track1Runner [track 1 data file directory] [output file]}</p>
- */
-public final class Track1Runner {
-
- private static final Logger log = LoggerFactory.getLogger(Track1Runner.class);
-
- private Track1Runner() {
- }
-
- public static void main(String[] args) throws Exception {
-
- File dataFileDirectory = new File(args[0]);
- if (!dataFileDirectory.exists() || !dataFileDirectory.isDirectory()) {
- throw new IllegalArgumentException("Bad data file directory: " + dataFileDirectory);
- }
-
- long start = System.currentTimeMillis();
-
- KDDCupDataModel model = new KDDCupDataModel(KDDCupDataModel.getTrainingFile(dataFileDirectory));
- Track1Recommender recommender = new Track1Recommender(model);
-
- long end = System.currentTimeMillis();
- log.info("Loaded model in {}s", (end - start) / 1000);
- start = end;
-
- Collection<Track1Callable> callables = new ArrayList<>();
- for (Pair<PreferenceArray,long[]> tests : new DataFileIterable(KDDCupDataModel.getTestFile(dataFileDirectory))) {
- PreferenceArray userTest = tests.getFirst();
- callables.add(new Track1Callable(recommender, userTest));
- }
-
- int cores = Runtime.getRuntime().availableProcessors();
- log.info("Running on {} cores", cores);
- ExecutorService executor = Executors.newFixedThreadPool(cores);
- List<Future<byte[]>> results = executor.invokeAll(callables);
- executor.shutdown();
-
- end = System.currentTimeMillis();
- log.info("Ran recommendations in {}s", (end - start) / 1000);
- start = end;
-
- try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(args[1])))){
- for (Future<byte[]> result : results) {
- for (byte estimate : result.get()) {
- out.write(estimate);
- }
- }
- }
-
- end = System.currentTimeMillis();
- log.info("Wrote output in {}s", (end - start) / 1000);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
deleted file mode 100644
index 022d78c..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/DataModelFactorizablePreferences.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.common.FastIDSet;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.model.GenericPreference;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.Preference;
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * can be used to drop {@link DataModel}s into {@link ParallelArraysSGDFactorizer}
- */
-public class DataModelFactorizablePreferences implements FactorizablePreferences {
-
- private final FastIDSet userIDs;
- private final FastIDSet itemIDs;
-
- private final List<Preference> preferences;
-
- private final float minPreference;
- private final float maxPreference;
-
- public DataModelFactorizablePreferences(DataModel dataModel) {
-
- minPreference = dataModel.getMinPreference();
- maxPreference = dataModel.getMaxPreference();
-
- try {
- userIDs = new FastIDSet(dataModel.getNumUsers());
- itemIDs = new FastIDSet(dataModel.getNumItems());
- preferences = new ArrayList<>();
-
- LongPrimitiveIterator userIDsIterator = dataModel.getUserIDs();
- while (userIDsIterator.hasNext()) {
- long userID = userIDsIterator.nextLong();
- userIDs.add(userID);
- for (Preference preference : dataModel.getPreferencesFromUser(userID)) {
- itemIDs.add(preference.getItemID());
- preferences.add(new GenericPreference(userID, preference.getItemID(), preference.getValue()));
- }
- }
- } catch (TasteException te) {
- throw new IllegalStateException("Unable to create factorizable preferences!", te);
- }
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- return userIDs.iterator();
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- return itemIDs.iterator();
- }
-
- @Override
- public Iterable<Preference> getPreferences() {
- return preferences;
- }
-
- @Override
- public float getMinPreference() {
- return minPreference;
- }
-
- @Override
- public float getMaxPreference() {
- return maxPreference;
- }
-
- @Override
- public int numUsers() {
- return userIDs.size();
- }
-
- @Override
- public int numItems() {
- return itemIDs.size();
- }
-
- @Override
- public int numPreferences() {
- return preferences.size();
- }
-}
-

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
deleted file mode 100644
index a126dec..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/FactorizablePreferences.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-
-/**
- * models the necessary input for {@link ParallelArraysSGDFactorizer}
- */
-public interface FactorizablePreferences {
-
- LongPrimitiveIterator getUserIDs();
-
- LongPrimitiveIterator getItemIDs();
-
- Iterable<Preference> getPreferences();
-
- float getMinPreference();
-
- float getMaxPreference();
-
- int numUsers();
-
- int numItems();
-
- int numPreferences();
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
deleted file mode 100644
index 6dcef6b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/kddcup/track1/svd/KDDCupFactorizablePreferences.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.kddcup.track1.svd;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import org.apache.mahout.cf.taste.example.kddcup.DataFileIterable;
-import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator;
-import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
-import org.apache.mahout.cf.taste.model.Preference;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-import org.apache.mahout.common.Pair;
-
-import java.io.File;
-
-public class KDDCupFactorizablePreferences implements FactorizablePreferences {
-
- private final File dataFile;
-
- public KDDCupFactorizablePreferences(File dataFile) {
- this.dataFile = dataFile;
- }
-
- @Override
- public LongPrimitiveIterator getUserIDs() {
- return new FixedSizeLongIterator(numUsers());
- }
-
- @Override
- public LongPrimitiveIterator getItemIDs() {
- return new FixedSizeLongIterator(numItems());
- }
-
- @Override
- public Iterable<Preference> getPreferences() {
- Iterable<Iterable<Preference>> prefIterators =
- Iterables.transform(new DataFileIterable(dataFile),
- new Function<Pair<PreferenceArray,long[]>,Iterable<Preference>>() {
- @Override
- public Iterable<Preference> apply(Pair<PreferenceArray,long[]> from) {
- return from.getFirst();
- }
- });
- return Iterables.concat(prefIterators);
- }
-
- @Override
- public float getMinPreference() {
- return 0;
- }
-
- @Override
- public float getMaxPreference() {
- return 100;
- }
-
- @Override
- public int numUsers() {
- return 1000990;
- }
-
- @Override
- public int numItems() {
- return 624961;
- }
-
- @Override
- public int numPreferences() {
- return 252800275;
- }
-
- static class FixedSizeLongIterator extends AbstractLongPrimitiveIterator {
-
- private long currentValue;
- private final long maximum;
-
- FixedSizeLongIterator(long maximum) {
- this.maximum = maximum;
- currentValue = 0;
- }
-
- @Override
- public long nextLong() {
- return currentValue++;
- }
-
- @Override
- public long peek() {
- return currentValue;
- }
-
- @Override
- public void skip(int n) {
- currentValue += n;
- }
-
- @Override
- public boolean hasNext() {
- return currentValue < maximum;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
- }
-
-}
r***@apache.org
2018-06-28 14:55:17 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/bank-full.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/bank-full.csv b/community/mahout-mr/examples/bin/resources/bank-full.csv
deleted file mode 100644
index d7a2ede..0000000
--- a/community/mahout-mr/examples/bin/resources/bank-full.csv
+++ /dev/null
@@ -1,45212 +0,0 @@
-"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
-58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"
-44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"
-33;"unknown";"single";"unknown";"no";1;"no";"no";"unknown";5;"may";198;1;-1;0;"unknown";"no"
-35;"management";"married";"tertiary";"no";231;"yes";"no";"unknown";5;"may";139;1;-1;0;"unknown";"no"
-28;"management";"single";"tertiary";"no";447;"yes";"yes";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-42;"entrepreneur";"divorced";"tertiary";"yes";2;"yes";"no";"unknown";5;"may";380;1;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";121;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-43;"technician";"single";"secondary";"no";593;"yes";"no";"unknown";5;"may";55;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";270;"yes";"no";"unknown";5;"may";222;1;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";390;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";6;"yes";"no";"unknown";5;"may";517;1;-1;0;"unknown";"no"
-58;"technician";"married";"unknown";"no";71;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";162;"yes";"no";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-51;"retired";"married";"primary";"no";229;"yes";"no";"unknown";5;"may";353;1;-1;0;"unknown";"no"
-45;"admin.";"single";"unknown";"no";13;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";52;"yes";"no";"unknown";5;"may";38;1;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";60;"yes";"no";"unknown";5;"may";219;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";54;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"secondary";"no";723;"yes";"yes";"unknown";5;"may";262;1;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";779;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"primary";"no";23;"yes";"yes";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-25;"services";"married";"secondary";"no";50;"yes";"no";"unknown";5;"may";342;1;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-44;"admin.";"married";"secondary";"no";-372;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-39;"management";"single";"tertiary";"no";255;"yes";"no";"unknown";5;"may";296;1;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"secondary";"no";113;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-46;"management";"single";"secondary";"no";-246;"yes";"no";"unknown";5;"may";255;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";265;"yes";"yes";"unknown";5;"may";348;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";839;"no";"yes";"unknown";5;"may";225;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";378;"yes";"no";"unknown";5;"may";230;1;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";39;"yes";"yes";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";226;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";10635;"yes";"no";"unknown";5;"may";336;1;-1;0;"unknown";"no"
-57;"technician";"divorced";"secondary";"no";63;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-25;"blue-collar";"married";"secondary";"no";-7;"yes";"no";"unknown";5;"may";365;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";-3;"no";"no";"unknown";5;"may";1666;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";506;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";137;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";2586;"yes";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-50;"management";"married";"secondary";"no";49;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";104;"yes";"no";"unknown";5;"may";22;1;-1;0;"unknown";"no"
-54;"retired";"married";"secondary";"no";529;"yes";"no";"unknown";5;"may";1492;1;-1;0;"unknown";"no"
-58;"retired";"married";"unknown";"no";96;"yes";"no";"unknown";5;"may";616;1;-1;0;"unknown";"no"
-36;"admin.";"single";"primary";"no";-171;"yes";"no";"unknown";5;"may";242;1;-1;0;"unknown";"no"
-58;"self-employed";"married";"tertiary";"no";-364;"yes";"no";"unknown";5;"may";355;1;-1;0;"unknown";"no"
-44;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-55;"technician";"divorced";"secondary";"no";0;"no";"no";"unknown";5;"may";160;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";363;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";1291;"yes";"no";"unknown";5;"may";266;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";-244;"yes";"no";"unknown";5;"may";253;1;-1;0;"unknown";"no"
-32;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";179;1;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";-76;"yes";"no";"unknown";5;"may";787;1;-1;0;"unknown";"no"
-24;"technician";"single";"secondary";"no";-103;"yes";"yes";"unknown";5;"may";145;1;-1;0;"unknown";"no"
-38;"entrepreneur";"single";"tertiary";"no";243;"no";"yes";"unknown";5;"may";174;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";424;"yes";"no";"unknown";5;"may";104;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";306;"yes";"no";"unknown";5;"may";13;1;-1;0;"unknown";"no"
-40;"blue-collar";"single";"unknown";"no";24;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-46;"services";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";1778;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";0;"yes";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-53;"technician";"divorced";"secondary";"no";989;"yes";"no";"unknown";5;"may";812;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";249;"yes";"no";"unknown";5;"may";164;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";790;"yes";"no";"unknown";5;"may";391;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";154;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-51;"management";"married";"tertiary";"no";6530;"yes";"no";"unknown";5;"may";91;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";100;"no";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";59;"yes";"no";"unknown";5;"may";273;1;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";1205;"yes";"no";"unknown";5;"may";158;2;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";12223;"yes";"yes";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"secondary";"no";5935;"yes";"yes";"unknown";5;"may";258;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";25;"yes";"yes";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";282;"yes";"yes";"unknown";5;"may";154;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";1937;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";384;"yes";"no";"unknown";5;"may";176;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";582;"no";"yes";"unknown";5;"may";211;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";91;"no";"no";"unknown";5;"may";349;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";5;"may";272;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"yes";1;"yes";"no";"unknown";5;"may";208;1;-1;0;"unknown";"no"
-45;"admin.";"single";"secondary";"no";206;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";164;"no";"no";"unknown";5;"may";212;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";690;"yes";"no";"unknown";5;"may";20;1;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";2343;"yes";"no";"unknown";5;"may";1042;1;-1;0;"unknown";"yes"
-46;"self-employed";"married";"tertiary";"no";137;"yes";"yes";"unknown";5;"may";246;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";173;"yes";"no";"unknown";5;"may";529;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";45;"no";"no";"unknown";5;"may";1467;1;-1;0;"unknown";"yes"
-41;"technician";"married";"secondary";"no";1270;"yes";"no";"unknown";5;"may";1389;1;-1;0;"unknown";"yes"
-46;"management";"divorced";"secondary";"no";16;"yes";"yes";"unknown";5;"may";188;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";486;"yes";"no";"unknown";5;"may";180;2;-1;0;"unknown";"no"
-42;"management";"single";"secondary";"no";50;"no";"no";"unknown";5;"may";48;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";152;"yes";"yes";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-60;"admin.";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";583;1;-1;0;"unknown";"no"
-60;"blue-collar";"married";"unknown";"no";54;"yes";"no";"unknown";5;"may";221;1;-1;0;"unknown";"no"
-57;"entrepreneur";"divorced";"secondary";"no";-37;"no";"no";"unknown";5;"may";173;1;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";101;"yes";"yes";"unknown";5;"may";426;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";383;"no";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-60;"retired";"married";"tertiary";"no";81;"yes";"no";"unknown";5;"may";101;1;-1;0;"unknown";"no"
-39;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";203;1;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";229;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-674;"yes";"no";"unknown";5;"may";257;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";90;"no";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";128;"yes";"no";"unknown";5;"may";229;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"primary";"no";179;"yes";"no";"unknown";5;"may";55;3;-1;0;"unknown";"no"
-27;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";400;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";54;"yes";"no";"unknown";5;"may";197;1;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";151;"yes";"no";"unknown";5;"may";190;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";61;"no";"yes";"unknown";5;"may";21;1;-1;0;"unknown";"no"
-59;"retired";"single";"secondary";"no";30;"yes";"no";"unknown";5;"may";514;1;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";523;"yes";"no";"unknown";5;"may";849;2;-1;0;"unknown";"no"
-29;"services";"divorced";"secondary";"no";31;"yes";"no";"unknown";5;"may";194;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"secondary";"no";79;"no";"no";"unknown";5;"may";144;1;-1;0;"unknown";"no"
-56;"self-employed";"married";"primary";"no";-34;"yes";"yes";"unknown";5;"may";212;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";448;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-59;"retired";"divorced";"primary";"no";81;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";144;"yes";"no";"unknown";5;"may";247;2;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";351;"yes";"no";"unknown";5;"may";518;1;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";-67;"yes";"no";"unknown";5;"may";364;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";262;"no";"no";"unknown";5;"may";178;1;-1;0;"unknown";"no"
-57;"technician";"married";"primary";"no";0;"no";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-56;"technician";"divorced";"unknown";"no";56;"yes";"no";"unknown";5;"may";439;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-34;"admin.";"married";"unknown";"no";3;"yes";"no";"unknown";5;"may";120;3;-1;0;"unknown";"no"
-43;"services";"married";"secondary";"no";41;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-52;"technician";"married";"tertiary";"no";7;"no";"yes";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-33;"technician";"single";"secondary";"no";105;"yes";"no";"unknown";5;"may";262;2;-1;0;"unknown";"no"
-29;"admin.";"single";"secondary";"no";818;"yes";"yes";"unknown";5;"may";61;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";-16;"yes";"yes";"unknown";5;"may";78;1;-1;0;"unknown";"no"
-31;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";143;1;-1;0;"unknown";"no"
-55;"services";"married";"secondary";"no";2476;"yes";"no";"unknown";5;"may";579;1;-1;0;"unknown";"yes"
-55;"management";"married";"unknown";"no";1185;"no";"no";"unknown";5;"may";677;1;-1;0;"unknown";"no"
-32;"admin.";"single";"secondary";"no";217;"yes";"no";"unknown";5;"may";345;1;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";1685;"yes";"no";"unknown";5;"may";185;1;-1;0;"unknown";"no"
-55;"admin.";"single";"secondary";"no";802;"yes";"yes";"unknown";5;"may";100;2;-1;0;"unknown";"no"
-28;"unemployed";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-23;"blue-collar";"married";"secondary";"no";94;"yes";"no";"unknown";5;"may";193;1;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"services";"single";"unknown";"no";0;"no";"no";"unknown";5;"may";73;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";517;"yes";"no";"unknown";5;"may";528;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";265;"yes";"no";"unknown";5;"may";541;1;-1;0;"unknown";"no"
-53;"housemaid";"divorced";"primary";"no";947;"yes";"no";"unknown";5;"may";163;1;-1;0;"unknown";"no"
-34;"self-employed";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";301;1;-1;0;"unknown";"no"
-57;"unemployed";"married";"tertiary";"no";42;"no";"no";"unknown";5;"may";46;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";37;"yes";"no";"unknown";5;"may";204;1;-1;0;"unknown";"no"
-59;"blue-collar";"married";"secondary";"no";57;"yes";"no";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";22;"yes";"no";"unknown";5;"may";71;1;-1;0;"unknown";"no"
-56;"blue-collar";"divorced";"primary";"no";8;"yes";"no";"unknown";5;"may";157;2;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";293;"yes";"no";"unknown";5;"may";243;1;-1;0;"unknown";"no"
-43;"services";"married";"primary";"no";3;"yes";"no";"unknown";5;"may";186;2;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";348;"yes";"no";"unknown";5;"may";579;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";-19;"yes";"no";"unknown";5;"may";163;2;-1;0;"unknown";"no"
-26;"student";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";610;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";-4;"yes";"no";"unknown";5;"may";2033;1;-1;0;"unknown";"no"
-39;"management";"married";"secondary";"no";18;"yes";"no";"unknown";5;"may";85;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";139;"no";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-41;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";114;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"unknown";"no";1883;"yes";"no";"unknown";5;"may";57;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";216;"yes";"no";"unknown";5;"may";238;1;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";782;"yes";"no";"unknown";5;"may";93;3;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";904;"yes";"no";"unknown";5;"may";128;2;-1;0;"unknown";"no"
-48;"services";"married";"unknown";"no";1705;"yes";"no";"unknown";5;"may";107;1;-1;0;"unknown";"no"
-39;"technician";"single";"tertiary";"no";47;"yes";"no";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-47;"services";"single";"secondary";"no";176;"yes";"no";"unknown";5;"may";303;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";1225;"yes";"no";"unknown";5;"may";558;5;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";86;"yes";"no";"unknown";5;"may";270;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";228;1;-1;0;"unknown";"no"
-52;"management";"married";"tertiary";"no";271;"yes";"no";"unknown";5;"may";99;1;-1;0;"unknown";"no"
-54;"technician";"married";"secondary";"no";1378;"yes";"no";"unknown";5;"may";240;1;-1;0;"unknown";"no"
-54;"admin.";"married";"tertiary";"no";184;"no";"no";"unknown";5;"may";673;2;-1;0;"unknown";"yes"
-50;"blue-collar";"married";"primary";"no";0;"no";"no";"unknown";5;"may";233;3;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";1056;1;-1;0;"unknown";"no"
-44;"services";"married";"secondary";"no";1357;"yes";"yes";"unknown";5;"may";250;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"unknown";"no";19;"yes";"no";"unknown";5;"may";252;1;-1;0;"unknown";"no"
-35;"retired";"single";"primary";"no";434;"no";"no";"unknown";5;"may";138;1;-1;0;"unknown";"no"
-60;"admin.";"divorced";"secondary";"no";92;"yes";"no";"unknown";5;"may";130;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";1151;"yes";"no";"unknown";5;"may";412;1;-1;0;"unknown";"no"
-48;"unemployed";"married";"secondary";"no";41;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";51;"yes";"no";"unknown";5;"may";19;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";214;"yes";"no";"unknown";5;"may";458;2;-1;0;"unknown";"no"
-51;"management";"married";"secondary";"no";1161;"yes";"no";"unknown";5;"may";717;1;-1;0;"unknown";"no"
-31;"services";"married";"tertiary";"no";37;"yes";"no";"unknown";5;"may";313;1;-1;0;"unknown";"no"
-35;"technician";"divorced";"secondary";"no";787;"yes";"no";"unknown";5;"may";683;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";59;"yes";"no";"unknown";5;"may";1077;1;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";253;"yes";"no";"unknown";5;"may";416;1;-1;0;"unknown";"no"
-36;"admin.";"married";"tertiary";"no";211;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-58;"retired";"married";"primary";"no";235;"yes";"no";"unknown";5;"may";167;1;-1;0;"unknown";"no"
-40;"services";"divorced";"unknown";"no";4384;"yes";"no";"unknown";5;"may";315;1;-1;0;"unknown";"no"
-54;"management";"married";"secondary";"no";4080;"no";"no";"unknown";5;"may";140;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"secondary";"no";53;"yes";"yes";"unknown";5;"may";346;1;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";5;"may";562;1;-1;0;"unknown";"no"
-51;"retired";"married";"secondary";"no";2127;"yes";"no";"unknown";5;"may";172;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";377;"yes";"no";"unknown";5;"may";217;1;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";73;"yes";"no";"unknown";5;"may";142;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";5;"may";67;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";243;"yes";"no";"unknown";5;"may";291;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"secondary";"no";307;"yes";"no";"unknown";5;"may";309;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";155;"yes";"no";"unknown";5;"may";248;1;-1;0;"unknown";"no"
-50;"technician";"divorced";"tertiary";"no";173;"no";"yes";"unknown";5;"may";98;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";400;"yes";"no";"unknown";5;"may";256;1;-1;0;"unknown";"no"
-61;"blue-collar";"divorced";"primary";"no";1428;"yes";"no";"unknown";5;"may";82;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";577;1;-1;0;"unknown";"no"
-48;"self-employed";"married";"tertiary";"no";7;"yes";"no";"unknown";5;"may";286;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";575;"yes";"no";"unknown";5;"may";477;1;-1;0;"unknown";"no"
-35;"student";"single";"unknown";"no";298;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";471;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";5699;"yes";"no";"unknown";5;"may";381;2;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";176;"yes";"yes";"unknown";5;"may";42;1;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";517;"yes";"no";"unknown";5;"may";251;1;-1;0;"unknown";"no"
-39;"services";"single";"unknown";"no";257;"yes";"no";"unknown";5;"may";408;1;-1;0;"unknown";"no"
-42;"retired";"married";"secondary";"no";56;"yes";"no";"unknown";5;"may";215;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";-390;"yes";"no";"unknown";5;"may";287;1;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";330;"yes";"no";"unknown";5;"may";216;2;-1;0;"unknown";"no"
-59;"housemaid";"divorced";"primary";"no";195;"no";"no";"unknown";5;"may";366;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";301;"yes";"no";"unknown";5;"may";210;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";-41;"yes";"no";"unknown";5;"may";288;1;-1;0;"unknown";"no"
-40;"technician";"married";"tertiary";"no";483;"yes";"no";"unknown";5;"may";168;1;-1;0;"unknown";"no"
-47;"unknown";"married";"unknown";"no";28;"no";"no";"unknown";5;"may";338;2;-1;0;"unknown";"no"
-53;"unemployed";"married";"unknown";"no";13;"no";"no";"unknown";5;"may";410;3;-1;0;"unknown";"no"
-46;"housemaid";"married";"primary";"no";965;"no";"no";"unknown";5;"may";177;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";378;"yes";"yes";"unknown";5;"may";127;2;-1;0;"unknown";"no"
-40;"unemployed";"married";"secondary";"no";219;"yes";"no";"unknown";5;"may";357;1;-1;0;"unknown";"no"
-28;"blue-collar";"married";"primary";"no";324;"yes";"no";"unknown";5;"may";175;1;-1;0;"unknown";"no"
-35;"entrepreneur";"divorced";"secondary";"no";-69;"yes";"no";"unknown";5;"may";300;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";0;"no";"yes";"unknown";5;"may";136;1;-1;0;"unknown";"no"
-43;"technician";"divorced";"unknown";"no";205;"yes";"no";"unknown";5;"may";1419;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"primary";"no";278;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-58;"management";"married";"unknown";"no";1065;"yes";"no";"unknown";5;"may";213;3;-1;0;"unknown";"no"
-33;"management";"single";"tertiary";"no";34;"yes";"no";"unknown";5;"may";27;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"unknown";"no";1033;"no";"no";"unknown";5;"may";238;2;-1;0;"unknown";"no"
-53;"services";"divorced";"secondary";"no";1467;"yes";"no";"unknown";5;"may";124;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";-12;"yes";"no";"unknown";5;"may";18;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";388;"yes";"no";"unknown";5;"may";730;2;-1;0;"unknown";"no"
-57;"entrepreneur";"married";"secondary";"no";294;"yes";"no";"unknown";5;"may";746;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"unknown";"no";1827;"no";"no";"unknown";5;"may";121;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"primary";"no";627;"yes";"no";"unknown";5;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";5;"may";40;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"secondary";"no";315;"yes";"no";"unknown";5;"may";181;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";79;1;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";66;"yes";"no";"unknown";5;"may";206;1;-1;0;"unknown";"no"
-49;"blue-collar";"divorced";"primary";"no";-9;"yes";"yes";"unknown";5;"may";389;1;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"no";349;"yes";"yes";"unknown";5;"may";127;1;-1;0;"unknown";"no"
-43;"entrepreneur";"married";"unknown";"no";100;"yes";"no";"unknown";5;"may";702;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-43;"technician";"married";"secondary";"no";434;"yes";"no";"unknown";5;"may";117;1;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";3237;"yes";"no";"unknown";5;"may";232;3;-1;0;"unknown";"no"
-42;"management";"married";"unknown";"no";275;"no";"no";"unknown";5;"may";408;2;-1;0;"unknown";"no"
-22;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";179;2;-1;0;"unknown";"no"
-40;"management";"married";"tertiary";"no";207;"yes";"no";"unknown";5;"may";39;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";483;"yes";"no";"unknown";5;"may";282;1;-1;0;"unknown";"no"
-51;"services";"married";"secondary";"no";2248;"yes";"no";"unknown";5;"may";714;2;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";428;"yes";"no";"unknown";5;"may";50;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";0;"yes";"yes";"unknown";5;"may";181;1;-1;0;"unknown";"no"
-34;"services";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";142;1;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";140;"yes";"no";"unknown";5;"may";227;1;-1;0;"unknown";"no"
-50;"management";"single";"tertiary";"no";297;"yes";"no";"unknown";5;"may";119;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";279;"yes";"no";"unknown";5;"may";361;1;-1;0;"unknown";"no"
-59;"entrepreneur";"divorced";"secondary";"no";901;"yes";"no";"unknown";5;"may";73;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";2573;"yes";"no";"unknown";5;"may";67;2;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";143;"yes";"yes";"unknown";5;"may";350;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";475;"yes";"no";"unknown";5;"may";332;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";70;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";318;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";275;"yes";"no";"unknown";5;"may";132;1;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";742;"yes";"no";"unknown";5;"may";58;3;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"primary";"no";236;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";25;"yes";"no";"unknown";5;"may";89;2;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";600;"yes";"no";"unknown";5;"may";152;1;-1;0;"unknown";"no"
-39;"admin.";"divorced";"secondary";"no";-349;"yes";"no";"unknown";5;"may";611;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";183;"yes";"yes";"unknown";5;"may";110;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";5;"may";463;1;-1;0;"unknown";"no"
-42;"management";"single";"tertiary";"no";0;"yes";"yes";"unknown";5;"may";562;2;-1;0;"unknown";"yes"
-40;"blue-collar";"divorced";"primary";"no";0;"yes";"no";"unknown";5;"may";962;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";1078;"yes";"no";"unknown";5;"may";10;4;-1;0;"unknown";"no"
-56;"entrepreneur";"divorced";"secondary";"no";155;"no";"no";"unknown";5;"may";118;3;-1;0;"unknown";"no"
-37;"admin.";"married";"secondary";"no";190;"yes";"no";"unknown";5;"may";92;2;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";319;"yes";"no";"unknown";5;"may";143;3;-1;0;"unknown";"no"
-39;"services";"divorced";"secondary";"no";-185;"yes";"no";"unknown";5;"may";189;3;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";47;"no";"no";"unknown";5;"may";234;2;-1;0;"unknown";"no"
-38;"services";"single";"secondary";"no";570;"yes";"no";"unknown";5;"may";75;2;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";19;"no";"no";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";61;"yes";"no";"unknown";5;"may";621;3;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";-62;"yes";"yes";"unknown";5;"may";55;2;-1;0;"unknown";"no"
-54;"technician";"married";"tertiary";"no";258;"no";"no";"unknown";5;"may";310;4;-1;0;"unknown";"no"
-58;"blue-collar";"married";"primary";"no";76;"yes";"no";"unknown";5;"may";156;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";5;2;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";352;"yes";"no";"unknown";5;"may";225;2;-1;0;"unknown";"no"
-47;"admin.";"married";"secondary";"no";368;"yes";"no";"unknown";5;"may";125;2;-1;0;"unknown";"no"
-50;"technician";"single";"tertiary";"no";339;"yes";"no";"unknown";5;"may";2;3;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";1331;"yes";"no";"unknown";5;"may";286;2;-1;0;"unknown";"no"
-40;"self-employed";"married";"secondary";"no";672;"yes";"no";"unknown";5;"may";164;2;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";58;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-54;"technician";"single";"unknown";"no";447;"yes";"no";"unknown";5;"may";742;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";423;"yes";"no";"unknown";5;"may";226;3;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";0;"no";"no";"unknown";5;"may";120;2;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";5;"may";362;4;-1;0;"unknown";"no"
-56;"technician";"divorced";"primary";"no";13;"yes";"no";"unknown";5;"may";357;2;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";3;"yes";"no";"unknown";5;"may";200;2;-1;0;"unknown";"no"
-24;"student";"single";"secondary";"no";82;"yes";"no";"unknown";5;"may";204;2;-1;0;"unknown";"no"
-42;"blue-collar";"divorced";"primary";"no";28;"yes";"no";"unknown";5;"may";126;3;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";792;"yes";"no";"unknown";5;"may";65;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";408;"yes";"no";"unknown";5;"may";107;2;-1;0;"unknown";"no"
-51;"admin.";"married";"secondary";"no";531;"yes";"no";"unknown";5;"may";267;2;-1;0;"unknown";"no"
-57;"retired";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";248;2;-1;0;"unknown";"no"
-36;"services";"single";"secondary";"no";62;"yes";"no";"unknown";5;"may";215;2;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";257;"yes";"no";"unknown";5;"may";209;2;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";1234;"yes";"no";"unknown";5;"may";205;2;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"no";313;"yes";"no";"unknown";5;"may";83;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";5;"may";106;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";129;"yes";"yes";"unknown";5;"may";189;2;-1;0;"unknown";"no"
-43;"management";"married";"unknown";"no";0;"yes";"no";"unknown";5;"may";105;2;-1;0;"unknown";"no"
-56;"admin.";"married";"secondary";"no";353;"yes";"no";"unknown";5;"may";106;2;-1;0;"unknown";"no"
-54;"technician";"married";"unknown";"no";851;"yes";"no";"unknown";5;"may";108;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";96;"yes";"yes";"unknown";5;"may";311;2;-1;0;"unknown";"no"
-37;"services";"divorced";"secondary";"no";398;"yes";"yes";"unknown";5;"may";214;2;-1;0;"unknown";"no"
-33;"admin.";"single";"tertiary";"no";193;"no";"no";"unknown";5;"may";132;2;-1;0;"unknown";"no"
-46;"admin.";"married";"secondary";"no";-358;"yes";"no";"unknown";5;"may";358;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";539;"yes";"yes";"unknown";5;"may";453;2;-1;0;"unknown";"no"
-51;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";5;"may";364;2;-1;0;"unknown";"no"
-40;"retired";"single";"primary";"no";0;"no";"no";"unknown";5;"may";136;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";490;"yes";"no";"unknown";5;"may";386;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";173;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"unknown";"no";403;"yes";"no";"unknown";5;"may";241;2;-1;0;"unknown";"no"
-48;"management";"married";"secondary";"no";161;"yes";"no";"unknown";5;"may";224;3;-1;0;"unknown";"no"
-32;"technician";"divorced";"tertiary";"no";2558;"no";"no";"unknown";5;"may";148;2;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";98;"yes";"no";"unknown";5;"may";196;2;-1;0;"unknown";"no"
-55;"management";"single";"tertiary";"no";115;"no";"no";"unknown";5;"may";111;4;-1;0;"unknown";"no"
-40;"blue-collar";"single";"secondary";"no";436;"yes";"no";"unknown";5;"may";231;3;-1;0;"unknown";"no"
-47;"technician";"married";"tertiary";"no";831;"yes";"no";"unknown";5;"may";316;3;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";206;"yes";"no";"unknown";5;"may";216;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";290;"yes";"no";"unknown";5;"may";240;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";1;"no";"no";"unknown";5;"may";669;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";57;"yes";"no";"unknown";5;"may";425;2;-1;0;"unknown";"no"
-30;"blue-collar";"single";"secondary";"no";-457;"yes";"no";"unknown";5;"may";143;2;-1;0;"unknown";"no"
-58;"management";"single";"tertiary";"no";1387;"yes";"no";"unknown";5;"may";174;5;-1;0;"unknown";"no"
-45;"management";"divorced";"tertiary";"no";24598;"yes";"no";"unknown";5;"may";313;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";30;"yes";"no";"unknown";5;"may";135;4;-1;0;"unknown";"no"
-42;"admin.";"single";"secondary";"no";1022;"yes";"no";"unknown";5;"may";146;2;-1;0;"unknown";"no"
-53;"technician";"married";"secondary";"no";56;"yes";"yes";"unknown";5;"may";152;2;-1;0;"unknown";"no"
-51;"admin.";"single";"secondary";"yes";-2;"no";"no";"unknown";5;"may";402;3;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";121;"yes";"no";"unknown";5;"may";213;2;-1;0;"unknown";"no"
-41;"blue-collar";"single";"secondary";"no";842;"yes";"no";"unknown";5;"may";144;3;-1;0;"unknown";"no"
-43;"management";"divorced";"secondary";"no";693;"yes";"no";"unknown";5;"may";124;3;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";-333;"yes";"no";"unknown";5;"may";183;2;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";1533;"yes";"no";"unknown";5;"may";325;2;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";46;"yes";"no";"unknown";5;"may";39;4;-1;0;"unknown";"no"
-53;"services";"married";"unknown";"no";18;"no";"no";"unknown";5;"may";503;2;-1;0;"unknown";"no"
-45;"technician";"married";"secondary";"no";44;"yes";"no";"unknown";5;"may";95;4;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";-100;"yes";"no";"unknown";5;"may";680;2;-1;0;"unknown";"no"
-44;"services";"married";"tertiary";"no";510;"yes";"no";"unknown";5;"may";421;4;-1;0;"unknown";"no"
-55;"management";"married";"tertiary";"no";685;"yes";"no";"unknown";5;"may";174;3;-1;0;"unknown";"no"
-46;"management";"single";"tertiary";"no";187;"yes";"no";"unknown";5;"may";113;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";66;"yes";"no";"unknown";5;"may";808;2;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";560;"yes";"no";"unknown";5;"may";198;3;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";5;"may";195;2;-1;0;"unknown";"no"
-59;"unknown";"divorced";"unknown";"no";27;"no";"no";"unknown";5;"may";347;3;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";12;"yes";"no";"unknown";5;"may";208;2;-1;0;"unknown";"no"
-44;"blue-collar";"single";"secondary";"no";34;"yes";"no";"unknown";5;"may";404;4;-1;0;"unknown";"no"
-33;"entrepreneur";"single";"tertiary";"no";1068;"yes";"no";"unknown";5;"may";396;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";211;"yes";"no";"unknown";5;"may";216;4;-1;0;"unknown";"no"
-46;"admin.";"single";"tertiary";"no";377;"yes";"no";"unknown";5;"may";98;2;-1;0;"unknown";"no"
-48;"management";"married";"tertiary";"no";263;"yes";"no";"unknown";5;"may";350;2;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";1263;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-27;"services";"married";"secondary";"no";8;"yes";"no";"unknown";6;"may";88;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";126;"yes";"yes";"unknown";6;"may";379;2;-1;0;"unknown";"no"
-59;"admin.";"married";"secondary";"no";230;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-46;"technician";"married";"tertiary";"no";841;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"secondary";"no";308;"yes";"no";"unknown";6;"may";102;1;-1;0;"unknown";"no"
-43;"management";"divorced";"tertiary";"no";1;"yes";"no";"unknown";6;"may";306;1;-1;0;"unknown";"no"
-38;"admin.";"divorced";"tertiary";"no";86;"yes";"no";"unknown";6;"may";218;1;-1;0;"unknown";"no"
-23;"student";"single";"secondary";"no";157;"yes";"no";"unknown";6;"may";54;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";22;"yes";"no";"unknown";6;"may";344;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";46;"yes";"yes";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";1293;"no";"no";"unknown";6;"may";652;1;-1;0;"unknown";"no"
-25;"admin.";"single";"secondary";"no";122;"yes";"no";"unknown";6;"may";286;1;-1;0;"unknown";"no"
-48;"blue-collar";"married";"unknown";"no";131;"yes";"no";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-49;"blue-collar";"single";"secondary";"no";143;"yes";"no";"unknown";6;"may";83;1;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";393;"no";"no";"unknown";6;"may";184;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";98;"yes";"no";"unknown";6;"may";235;1;-1;0;"unknown";"no"
-33;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";290;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";224;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";757;"yes";"no";"unknown";6;"may";133;1;-1;0;"unknown";"no"
-49;"services";"married";"secondary";"no";245;"yes";"yes";"unknown";6;"may";318;1;-1;0;"unknown";"no"
-40;"management";"married";"secondary";"no";8486;"no";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-43;"admin.";"married";"unknown";"no";350;"no";"no";"unknown";6;"may";437;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";20;"yes";"no";"unknown";6;"may";402;1;-1;0;"unknown";"no"
-58;"services";"married";"secondary";"no";1667;"yes";"yes";"unknown";6;"may";85;1;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";345;"yes";"no";"unknown";6;"may";125;1;-1;0;"unknown";"no"
-32;"unemployed";"married";"secondary";"no";10;"yes";"no";"unknown";6;"may";501;4;-1;0;"unknown";"no"
-56;"management";"married";"tertiary";"no";830;"yes";"yes";"unknown";6;"may";1201;1;-1;0;"unknown";"yes"
-58;"blue-collar";"divorced";"unknown";"no";29;"yes";"no";"unknown";6;"may";253;1;-1;0;"unknown";"no"
-60;"retired";"divorced";"secondary";"no";545;"yes";"no";"unknown";6;"may";1030;1;-1;0;"unknown";"yes"
-37;"technician";"married";"tertiary";"no";8730;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-46;"technician";"divorced";"tertiary";"no";477;"yes";"no";"unknown";6;"may";114;1;-1;0;"unknown";"no"
-27;"admin.";"married";"secondary";"no";4;"yes";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";769;2;-1;0;"unknown";"no"
-32;"technician";"single";"secondary";"no";0;"yes";"yes";"unknown";6;"may";135;3;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";263;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";1;"no";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";283;"no";"yes";"unknown";6;"may";199;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"primary";"no";206;"yes";"no";"unknown";6;"may";152;1;-1;0;"unknown";"no"
-42;"housemaid";"married";"primary";"no";17;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";141;"yes";"yes";"unknown";6;"may";424;1;-1;0;"unknown";"no"
-29;"self-employed";"single";"tertiary";"no";16;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-50;"services";"married";"secondary";"no";206;"yes";"no";"unknown";6;"may";154;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";203;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";326;1;-1;0;"unknown";"no"
-58;"retired";"married";"tertiary";"no";0;"no";"no";"unknown";6;"may";393;1;-1;0;"unknown";"no"
-46;"blue-collar";"divorced";"primary";"no";1927;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";284;"yes";"no";"unknown";6;"may";483;1;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";1660;"yes";"no";"unknown";6;"may";259;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";406;"yes";"no";"unknown";6;"may";227;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";230;"yes";"no";"unknown";6;"may";673;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";-25;"yes";"no";"unknown";6;"may";576;1;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";182;"yes";"no";"unknown";6;"may";180;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";1169;"yes";"no";"unknown";6;"may";168;2;-1;0;"unknown";"no"
-34;"admin.";"divorced";"secondary";"no";67;"yes";"no";"unknown";6;"may";90;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";77;"no";"no";"unknown";6;"may";505;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";245;1;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"no";55;"yes";"yes";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-33;"technician";"married";"secondary";"yes";72;"yes";"no";"unknown";6;"may";623;1;-1;0;"unknown";"no"
-49;"management";"single";"tertiary";"no";163;"yes";"no";"unknown";6;"may";496;3;-1;0;"unknown";"no"
-32;"management";"single";"tertiary";"no";151;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";113;"yes";"no";"unknown";6;"may";342;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";9;"yes";"no";"unknown";6;"may";185;3;-1;0;"unknown";"no"
-43;"management";"married";"secondary";"no";375;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1142;"yes";"no";"unknown";6;"may";276;1;-1;0;"unknown";"no"
-54;"blue-collar";"married";"primary";"no";2102;"yes";"no";"unknown";6;"may";76;1;-1;0;"unknown";"no"
-38;"technician";"single";"tertiary";"no";4325;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";217;"yes";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";131;"yes";"no";"unknown";6;"may";744;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";1680;"yes";"no";"unknown";6;"may";765;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";119;1;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";320;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-55;"admin.";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";39;"no";"no";"unknown";6;"may";241;1;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";560;"yes";"no";"unknown";6;"may";181;1;-1;0;"unknown";"no"
-58;"technician";"divorced";"secondary";"no";469;"no";"no";"unknown";6;"may";196;1;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";530;"yes";"no";"unknown";6;"may";149;1;-1;0;"unknown";"no"
-49;"services";"married";"primary";"no";61;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-34;"technician";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";139;"yes";"no";"unknown";6;"may";309;2;-1;0;"unknown";"no"
-24;"self-employed";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-34;"technician";"married";"secondary";"no";367;"yes";"no";"unknown";6;"may";140;1;-1;0;"unknown";"no"
-51;"admin.";"divorced";"secondary";"no";228;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-39;"technician";"single";"unknown";"no";45248;"yes";"no";"unknown";6;"may";1623;1;-1;0;"unknown";"yes"
-50;"self-employed";"married";"unknown";"no";-84;"yes";"no";"unknown";6;"may";101;1;-1;0;"unknown";"no"
-32;"services";"single";"secondary";"no";310;"yes";"no";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-42;"blue-collar";"married";"unknown";"no";132;"yes";"no";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"technician";"married";"secondary";"no";797;"yes";"no";"unknown";6;"may";354;1;-1;0;"unknown";"no"
-40;"services";"married";"secondary";"no";71;"no";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-46;"management";"divorced";"unknown";"no";2;"yes";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";231;"yes";"yes";"unknown";6;"may";451;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";270;"yes";"yes";"unknown";6;"may";159;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";274;"yes";"yes";"unknown";6;"may";409;1;-1;0;"unknown";"no"
-40;"admin.";"single";"secondary";"no";-109;"yes";"yes";"unknown";6;"may";170;1;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";608;1;-1;0;"unknown";"yes"
-33;"blue-collar";"single";"secondary";"yes";-60;"no";"no";"unknown";6;"may";243;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";89;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-58;"blue-collar";"divorced";"secondary";"no";-11;"no";"no";"unknown";6;"may";112;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";-509;"yes";"no";"unknown";6;"may";124;1;-1;0;"unknown";"no"
-39;"unemployed";"married";"primary";"no";408;"yes";"no";"unknown";6;"may";53;1;-1;0;"unknown";"no"
-36;"services";"single";"primary";"no";58;"yes";"no";"unknown";6;"may";134;1;-1;0;"unknown";"no"
-57;"retired";"single";"secondary";"no";1640;"no";"yes";"unknown";6;"may";204;4;-1;0;"unknown";"no"
-36;"admin.";"single";"secondary";"no";20;"yes";"no";"unknown";6;"may";186;1;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";71;"yes";"no";"unknown";6;"may";678;1;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";52;"yes";"no";"unknown";6;"may";182;1;-1;0;"unknown";"no"
-44;"self-employed";"married";"tertiary";"no";292;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";424;"yes";"no";"unknown";6;"may";27;1;-1;0;"unknown";"no"
-39;"housemaid";"single";"primary";"no";109;"yes";"no";"unknown";6;"may";699;3;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";1044;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";983;"yes";"no";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";869;"no";"no";"unknown";6;"may";1677;1;-1;0;"unknown";"yes"
-40;"blue-collar";"married";"primary";"no";668;"yes";"no";"unknown";6;"may";283;2;-1;0;"unknown";"no"
-50;"management";"married";"tertiary";"no";964;"yes";"no";"unknown";6;"may";323;1;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";301;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";140;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-39;"management";"single";"secondary";"no";1877;"yes";"no";"unknown";6;"may";185;1;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";1127;"yes";"no";"unknown";6;"may";47;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";871;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";767;"yes";"yes";"unknown";6;"may";204;1;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";187;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";30;2;-1;0;"unknown";"no"
-54;"management";"divorced";"primary";"no";0;"no";"no";"unknown";6;"may";472;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";110;"yes";"yes";"unknown";6;"may";448;1;-1;0;"unknown";"no"
-59;"management";"divorced";"tertiary";"no";-76;"yes";"yes";"unknown";6;"may";264;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";178;"yes";"no";"unknown";6;"may";169;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";288;1;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";176;2;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";215;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";337;1;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";5345;"no";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";-209;"yes";"no";"unknown";6;"may";188;2;-1;0;"unknown";"no"
-39;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"secondary";"no";42;"yes";"no";"unknown";6;"may";226;2;-1;0;"unknown";"no"
-50;"blue-collar";"divorced";"secondary";"no";41;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"secondary";"no";-99;"yes";"no";"unknown";6;"may";111;2;-1;0;"unknown";"no"
-37;"technician";"single";"secondary";"no";17;"yes";"no";"unknown";6;"may";164;1;-1;0;"unknown";"no"
-46;"admin.";"married";"primary";"no";276;"yes";"yes";"unknown";6;"may";157;2;-1;0;"unknown";"no"
-32;"technician";"single";"unknown";"no";-170;"no";"no";"unknown";6;"may";46;1;-1;0;"unknown";"no"
-37;"management";"single";"tertiary";"no";230;"yes";"yes";"unknown";6;"may";374;1;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";9;"yes";"no";"unknown";6;"may";349;1;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";946;"yes";"no";"unknown";6;"may";325;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";1297;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"retired";"divorced";"secondary";"no";-331;"yes";"no";"unknown";6;"may";531;1;-1;0;"unknown";"no"
-48;"blue-collar";"single";"secondary";"no";44;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-60;"retired";"married";"secondary";"yes";15;"no";"no";"unknown";6;"may";80;1;-1;0;"unknown";"no"
-26;"admin.";"single";"secondary";"no";712;"yes";"no";"unknown";6;"may";232;1;-1;0;"unknown";"no"
-58;"retired";"married";"secondary";"no";5435;"yes";"no";"unknown";6;"may";118;1;-1;0;"unknown";"no"
-34;"admin.";"married";"secondary";"no";507;"yes";"no";"unknown";6;"may";190;1;-1;0;"unknown";"no"
-55;"unemployed";"divorced";"secondary";"no";387;"yes";"no";"unknown";6;"may";918;1;-1;0;"unknown";"yes"
-41;"blue-collar";"married";"primary";"no";0;"yes";"yes";"unknown";6;"may";238;1;-1;0;"unknown";"no"
-50;"management";"divorced";"secondary";"no";1716;"yes";"no";"unknown";6;"may";82;1;-1;0;"unknown";"no"
-49;"entrepreneur";"married";"secondary";"no";167;"yes";"yes";"unknown";6;"may";198;3;-1;0;"unknown";"no"
-44;"admin.";"married";"unknown";"no";40;"no";"yes";"unknown";6;"may";160;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";148;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-31;"technician";"married";"secondary";"no";17;"yes";"yes";"unknown";6;"may";120;1;-1;0;"unknown";"no"
-34;"blue-collar";"single";"tertiary";"no";1011;"yes";"no";"unknown";6;"may";136;1;-1;0;"unknown";"no"
-46;"management";"single";"unknown";"no";1527;"yes";"no";"unknown";6;"may";269;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";744;"no";"no";"unknown";6;"may";157;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";128;1;-1;0;"unknown";"no"
-29;"management";"single";"tertiary";"no";0;"yes";"no";"unknown";6;"may";211;1;-1;0;"unknown";"no"
-53;"retired";"married";"primary";"no";136;"yes";"no";"unknown";6;"may";267;2;-1;0;"unknown";"no"
-43;"blue-collar";"married";"secondary";"no";1335;"yes";"no";"unknown";6;"may";371;2;-1;0;"unknown";"no"
-38;"management";"married";"secondary";"no";517;"yes";"no";"unknown";6;"may";288;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";459;"yes";"no";"unknown";6;"may";221;1;-1;0;"unknown";"no"
-48;"management";"divorced";"unknown";"no";549;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-30;"admin.";"divorced";"secondary";"no";83;"yes";"yes";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";213;"no";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-31;"housemaid";"married";"primary";"no";203;"yes";"no";"unknown";6;"may";604;3;-1;0;"unknown";"no"
-42;"services";"single";"secondary";"no";518;"yes";"no";"unknown";6;"may";198;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";3877;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-52;"admin.";"married";"secondary";"no";1236;"yes";"no";"unknown";6;"may";247;1;-1;0;"unknown";"no"
-45;"blue-collar";"divorced";"secondary";"no";756;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-48;"blue-collar";"married";"secondary";"no";157;"yes";"no";"unknown";6;"may";73;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";-66;"yes";"no";"unknown";6;"may";263;2;-1;0;"unknown";"no"
-34;"blue-collar";"married";"unknown";"no";245;"yes";"no";"unknown";6;"may";13;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"primary";"no";-144;"yes";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"secondary";"no";71;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-49;"services";"divorced";"secondary";"no";505;"yes";"no";"unknown";6;"may";162;1;-1;0;"unknown";"no"
-50;"technician";"married";"primary";"no";249;"yes";"no";"unknown";6;"may";129;1;-1;0;"unknown";"no"
-34;"admin.";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";150;1;-1;0;"unknown";"no"
-40;"unemployed";"single";"secondary";"no";11;"yes";"no";"unknown";6;"may";43;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";639;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-59;"blue-collar";"divorced";"unknown";"no";124;"yes";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";250;1;-1;0;"unknown";"no"
-36;"self-employed";"married";"tertiary";"no";107;"yes";"no";"unknown";6;"may";146;1;-1;0;"unknown";"no"
-56;"services";"married";"secondary";"no";473;"yes";"no";"unknown";6;"may";416;1;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";372;"yes";"yes";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";46;"yes";"no";"unknown";6;"may";114;2;-1;0;"unknown";"no"
-30;"student";"single";"tertiary";"no";34;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-47;"self-employed";"married";"unknown";"no";935;"yes";"no";"unknown";6;"may";225;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-10;"yes";"no";"unknown";6;"may";123;1;-1;0;"unknown";"no"
-36;"admin.";"married";"secondary";"no";-106;"yes";"no";"unknown";6;"may";130;2;-1;0;"unknown";"no"
-39;"services";"divorced";"primary";"no";471;"yes";"no";"unknown";6;"may";161;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"secondary";"no";778;"yes";"no";"unknown";6;"may";149;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"unknown";"no";170;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-42;"technician";"married";"secondary";"no";315;"yes";"no";"unknown";6;"may";259;2;-1;0;"unknown";"no"
-52;"blue-collar";"married";"secondary";"no";3165;"no";"no";"unknown";6;"may";26;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";131;"yes";"no";"unknown";6;"may";153;1;-1;0;"unknown";"no"
-35;"entrepreneur";"married";"secondary";"yes";204;"yes";"no";"unknown";6;"may";424;2;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";83;"yes";"no";"unknown";6;"may";179;2;-1;0;"unknown";"no"
-59;"services";"divorced";"secondary";"no";0;"yes";"yes";"unknown";6;"may";97;1;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";5431;"yes";"yes";"unknown";6;"may";383;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1759;"yes";"no";"unknown";6;"may";440;1;-1;0;"unknown";"no"
-46;"unemployed";"married";"secondary";"no";-125;"yes";"no";"unknown";6;"may";23;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-28;"services";"single";"secondary";"no";5090;"yes";"no";"unknown";6;"may";1297;3;-1;0;"unknown";"yes"
-38;"technician";"married";"unknown";"no";573;"yes";"no";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";1602;"yes";"no";"unknown";6;"may";427;1;-1;0;"unknown";"no"
-41;"blue-collar";"single";"primary";"yes";-137;"yes";"yes";"unknown";6;"may";189;1;-1;0;"unknown";"no"
-52;"technician";"married";"unknown";"no";0;"no";"no";"unknown";6;"may";195;1;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";193;"no";"no";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-61;"retired";"married";"secondary";"no";195;"yes";"yes";"unknown";6;"may";179;1;-1;0;"unknown";"no"
-53;"entrepreneur";"married";"secondary";"no";288;"no";"no";"unknown";6;"may";69;1;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";19;"yes";"no";"unknown";6;"may";105;2;-1;0;"unknown";"no"
-53;"blue-collar";"married";"primary";"no";25;"yes";"no";"unknown";6;"may";266;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";216;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-39;"blue-collar";"divorced";"primary";"no";190;"yes";"yes";"unknown";6;"may";96;2;-1;0;"unknown";"no"
-56;"technician";"divorced";"secondary";"no";99;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-55;"services";"divorced";"primary";"no";2298;"yes";"no";"unknown";6;"may";162;2;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";17;"yes";"no";"unknown";6;"may";352;2;-1;0;"unknown";"no"
-37;"technician";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";76;4;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"no";0;"yes";"no";"unknown";6;"may";154;2;-1;0;"unknown";"no"
-55;"blue-collar";"married";"secondary";"no";840;"yes";"no";"unknown";6;"may";310;2;-1;0;"unknown";"no"
-37;"services";"married";"secondary";"no";358;"yes";"no";"unknown";6;"may";390;3;-1;0;"unknown";"no"
-30;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";369;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";-325;"yes";"yes";"unknown";6;"may";112;2;-1;0;"unknown";"no"
-36;"technician";"single";"secondary";"no";-15;"yes";"no";"unknown";6;"may";341;3;-1;0;"unknown";"no"
-38;"technician";"married";"secondary";"no";581;"yes";"no";"unknown";6;"may";79;1;-1;0;"unknown";"no"
-41;"admin.";"divorced";"primary";"no";4070;"yes";"no";"unknown";6;"may";140;2;-1;0;"unknown";"no"
-48;"retired";"married";"secondary";"no";74;"no";"yes";"unknown";6;"may";315;1;-1;0;"unknown";"no"
-55;"services";"divorced";"secondary";"no";141;"yes";"no";"unknown";6;"may";262;2;-1;0;"unknown";"no"
-28;"services";"divorced";"secondary";"no";89;"no";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"yes";0;"yes";"no";"unknown";6;"may";138;3;-1;0;"unknown";"no"
-30;"blue-collar";"married";"secondary";"no";450;"no";"no";"unknown";6;"may";526;2;-1;0;"unknown";"no"
-48;"technician";"married";"tertiary";"no";310;"no";"no";"unknown";6;"may";135;1;-1;0;"unknown";"no"
-31;"self-employed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";36;5;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";384;"yes";"no";"unknown";6;"may";1906;3;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";395;"yes";"no";"unknown";6;"may";219;2;-1;0;"unknown";"no"
-37;"services";"single";"unknown";"no";-118;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-56;"blue-collar";"married";"primary";"no";5;"yes";"yes";"unknown";6;"may";407;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"secondary";"no";50;"yes";"yes";"unknown";6;"may";121;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";285;"yes";"yes";"unknown";6;"may";209;1;-1;0;"unknown";"no"
-49;"technician";"married";"unknown";"no";15;"no";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-51;"blue-collar";"married";"primary";"no";653;"yes";"yes";"unknown";6;"may";208;1;-1;0;"unknown";"no"
-43;"self-employed";"married";"secondary";"no";918;"yes";"no";"unknown";6;"may";193;1;-1;0;"unknown";"no"
-32;"services";"married";"secondary";"no";243;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";405;"yes";"no";"unknown";6;"may";65;1;-1;0;"unknown";"no"
-48;"management";"divorced";"tertiary";"no";1328;"yes";"no";"unknown";6;"may";339;1;-1;0;"unknown";"no"
-55;"services";"married";"primary";"no";255;"yes";"no";"unknown";6;"may";285;1;-1;0;"unknown";"no"
-53;"blue-collar";"married";"secondary";"no";3397;"yes";"no";"unknown";6;"may";231;1;-1;0;"unknown";"no"
-47;"technician";"married";"unknown";"no";2106;"yes";"no";"unknown";6;"may";168;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";2877;"yes";"no";"unknown";6;"may";278;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"tertiary";"no";60;"yes";"yes";"unknown";6;"may";389;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";2226;"yes";"no";"unknown";6;"may";158;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";2880;"yes";"no";"unknown";6;"may";145;2;-1;0;"unknown";"no"
-40;"technician";"single";"unknown";"no";-5;"yes";"no";"unknown";6;"may";78;2;-1;0;"unknown";"no"
-48;"technician";"married";"secondary";"no";147;"no";"no";"unknown";6;"may";142;3;-1;0;"unknown";"no"
-33;"technician";"divorced";"secondary";"no";7;"yes";"yes";"unknown";6;"may";87;1;-1;0;"unknown";"no"
-40;"technician";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";147;2;-1;0;"unknown";"no"
-59;"retired";"married";"primary";"no";-119;"yes";"no";"unknown";6;"may";289;1;-1;0;"unknown";"no"
-30;"technician";"married";"secondary";"no";484;"yes";"no";"unknown";6;"may";703;1;-1;0;"unknown";"yes"
-31;"management";"single";"tertiary";"no";1852;"yes";"no";"unknown";6;"may";170;3;-1;0;"unknown";"no"
-35;"unemployed";"married";"secondary";"no";533;"yes";"no";"unknown";6;"may";802;1;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";21;"yes";"no";"unknown";6;"may";381;2;-1;0;"unknown";"no"
-34;"admin.";"single";"unknown";"no";2434;"yes";"no";"unknown";6;"may";218;4;-1;0;"unknown";"no"
-32;"technician";"married";"secondary";"no";90;"yes";"yes";"unknown";6;"may";57;2;-1;0;"unknown";"no"
-56;"admin.";"divorced";"unknown";"no";4246;"yes";"no";"unknown";6;"may";304;2;-1;0;"unknown";"no"
-32;"admin.";"single";"tertiary";"no";395;"yes";"no";"unknown";6;"may";241;3;-1;0;"unknown";"no"
-42;"blue-collar";"married";"primary";"no";15;"yes";"no";"unknown";6;"may";230;1;-1;0;"unknown";"no"
-33;"services";"married";"tertiary";"no";85;"no";"no";"unknown";6;"may";262;3;-1;0;"unknown";"no"
-52;"entrepreneur";"married";"tertiary";"no";-184;"yes";"yes";"unknown";6;"may";392;2;-1;0;"unknown";"no"
-52;"services";"married";"secondary";"no";660;"no";"no";"unknown";6;"may";201;2;-1;0;"unknown";"no"
-52;"blue-collar";"divorced";"primary";"yes";-183;"yes";"no";"unknown";6;"may";145;1;-1;0;"unknown";"no"
-30;"unemployed";"divorced";"secondary";"no";1144;"yes";"no";"unknown";6;"may";252;1;-1;0;"unknown";"no"
-44;"services";"divorced";"secondary";"no";1;"yes";"no";"unknown";6;"may";235;4;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";69;"yes";"yes";"unknown";6;"may";235;2;-1;0;"unknown";"no"
-55;"management";"single";"secondary";"no";220;"yes";"no";"unknown";6;"may";328;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";332;"yes";"no";"unknown";6;"may";116;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";240;"yes";"no";"unknown";6;"may";246;1;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";293;1;-1;0;"unknown";"no"
-43;"unemployed";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";37;2;-1;0;"unknown";"no"
-38;"entrepreneur";"married";"tertiary";"no";898;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";123;"yes";"yes";"unknown";6;"may";530;2;-1;0;"unknown";"no"
-31;"student";"single";"secondary";"no";252;"yes";"no";"unknown";6;"may";175;3;-1;0;"unknown";"no"
-41;"management";"married";"tertiary";"no";65;"yes";"no";"unknown";6;"may";524;2;-1;0;"unknown";"no"
-41;"technician";"married";"secondary";"no";-366;"yes";"yes";"unknown";6;"may";29;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";209;"yes";"no";"unknown";6;"may";311;2;-1;0;"unknown";"no"
-38;"admin.";"single";"secondary";"no";221;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-44;"self-employed";"divorced";"tertiary";"no";4;"yes";"no";"unknown";6;"may";312;3;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";104;"yes";"no";"unknown";6;"may";412;1;-1;0;"unknown";"no"
-28;"technician";"single";"secondary";"no";312;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";-349;"yes";"no";"unknown";6;"may";191;1;-1;0;"unknown";"no"
-41;"services";"married";"unknown";"no";4;"no";"no";"unknown";6;"may";284;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-322;"yes";"yes";"unknown";6;"may";144;1;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";-150;"yes";"no";"unknown";6;"may";328;1;-1;0;"unknown";"no"
-38;"management";"married";"unknown";"no";1349;"yes";"no";"unknown";6;"may";100;1;-1;0;"unknown";"no"
-32;"admin.";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";226;1;-1;0;"unknown";"no"
-45;"services";"married";"secondary";"no";1259;"yes";"no";"unknown";6;"may";507;1;-1;0;"unknown";"no"
-33;"admin.";"single";"secondary";"no";101;"yes";"no";"unknown";6;"may";392;1;-1;0;"unknown";"no"
-34;"blue-collar";"married";"secondary";"no";848;"yes";"no";"unknown";6;"may";684;2;-1;0;"unknown";"no"
-41;"entrepreneur";"married";"unknown";"no";89;"yes";"no";"unknown";6;"may";333;2;-1;0;"unknown";"no"
-41;"blue-collar";"married";"secondary";"no";140;"yes";"no";"unknown";6;"may";311;3;-1;0;"unknown";"no"
-35;"admin.";"single";"secondary";"no";148;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-40;"technician";"single";"secondary";"no";200;"yes";"no";"unknown";6;"may";322;2;-1;0;"unknown";"no"
-60;"self-employed";"married";"primary";"no";46;"yes";"no";"unknown";6;"may";202;4;-1;0;"unknown";"no"
-47;"services";"divorced";"secondary";"no";201;"yes";"no";"unknown";6;"may";92;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"primary";"no";530;"yes";"no";"unknown";6;"may";739;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";0;"yes";"no";"unknown";6;"may";273;2;-1;0;"unknown";"no"
-49;"self-employed";"married";"secondary";"no";1;"yes";"no";"unknown";6;"may";260;3;-1;0;"unknown";"no"
-29;"blue-collar";"married";"secondary";"no";43;"yes";"no";"unknown";6;"may";268;2;-1;0;"unknown";"no"
-31;"management";"single";"tertiary";"no";-173;"yes";"no";"unknown";6;"may";396;2;-1;0;"unknown";"no"
-38;"management";"married";"tertiary";"no";389;"yes";"no";"unknown";6;"may";262;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";215;"yes";"yes";"unknown";6;"may";308;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";-131;"yes";"no";"unknown";6;"may";467;2;-1;0;"unknown";"no"
-31;"management";"single";"secondary";"no";783;"yes";"no";"unknown";6;"may";320;1;-1;0;"unknown";"no"
-41;"admin.";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-46;"services";"married";"unknown";"no";80;"yes";"no";"unknown";6;"may";245;2;-1;0;"unknown";"no"
-40;"services";"divorced";"secondary";"no";105;"yes";"no";"unknown";6;"may";189;2;-1;0;"unknown";"no"
-29;"admin.";"married";"secondary";"no";182;"yes";"yes";"unknown";6;"may";477;1;-1;0;"unknown";"no"
-49;"admin.";"married";"secondary";"no";82;"yes";"no";"unknown";6;"may";310;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";6;"may";65;3;-1;0;"unknown";"no"
-54;"services";"married";"secondary";"no";510;"yes";"no";"unknown";6;"may";196;2;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";242;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-53;"admin.";"married";"secondary";"no";244;"yes";"yes";"unknown";6;"may";197;2;-1;0;"unknown";"no"
-49;"management";"married";"tertiary";"no";92;"yes";"no";"unknown";6;"may";221;2;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";6;"may";64;2;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";948;"yes";"no";"unknown";6;"may";75;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"primary";"no";23;"yes";"no";"unknown";6;"may";400;2;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";710;"yes";"no";"unknown";6;"may";378;3;-1;0;"unknown";"no"
-39;"services";"married";"secondary";"no";1205;"yes";"no";"unknown";6;"may";118;2;-1;0;"unknown";"no"
-36;"technician";"married";"secondary";"no";368;"yes";"yes";"unknown";6;"may";1597;2;-1;0;"unknown";"yes"
-44;"entrepreneur";"married";"tertiary";"no";1631;"yes";"no";"unknown";6;"may";346;2;-1;0;"unknown";"no"
-40;"admin.";"married";"secondary";"no";6;"yes";"no";"unknown";6;"may";60;3;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";26;"yes";"no";"unknown";6;"may";276;2;-1;0;"unknown";"no"
-30;"technician";"single";"unknown";"no";-48;"yes";"no";"unknown";6;"may";152;2;-1;0;"unknown";"no"
-57;"management";"married";"tertiary";"no";2142;"yes";"no";"unknown";6;"may";251;3;-1;0;"unknown";"no"
-24;"services";"single";"secondary";"no";77;"yes";"yes";"unknown";6;"may";390;2;-1;0;"unknown";"no"
-46;"blue-collar";"married";"unknown";"no";401;"yes";"no";"unknown";6;"may";306;2;-1;0;"unknown";"no"
-33;"admin.";"married";"secondary";"no";21;"no";"no";"unknown";6;"may";189;3;-1;0;"unknown";"no"
-43;"services";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";125;2;-1;0;"unknown";"no"
-43;"admin.";"single";"secondary";"no";-497;"yes";"no";"unknown";6;"may";234;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"primary";"no";369;"no";"no";"unknown";6;"may";79;2;-1;0;"unknown";"no"
-44;"technician";"single";"unknown";"no";78;"yes";"no";"unknown";6;"may";13;6;-1;0;"unknown";"no"
-35;"technician";"single";"tertiary";"no";226;"yes";"yes";"unknown";6;"may";283;3;-1;0;"unknown";"no"
-47;"technician";"married";"secondary";"no";503;"yes";"no";"unknown";6;"may";109;2;-1;0;"unknown";"no"
-33;"blue-collar";"married";"secondary";"no";372;"yes";"no";"unknown";6;"may";132;2;-1;0;"unknown";"no"
-31;"admin.";"married";"secondary";"no";0;"yes";"yes";"unknown";6;"may";144;2;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";0;"yes";"no";"unknown";6;"may";121;2;-1;0;"unknown";"no"
-36;"entrepreneur";"married";"tertiary";"no";125;"yes";"no";"unknown";6;"may";95;3;-1;0;"unknown";"no"
-56;"retired";"divorced";"primary";"no";4;"yes";"no";"unknown";6;"may";31;3;-1;0;"unknown";"no"
-40;"admin.";"single";"unknown";"no";419;"yes";"no";"unknown";6;"may";112;3;-1;0;"unknown";"no"
-41;"admin.";"divorced";"secondary";"no";322;"yes";"no";"unknown";6;"may";87;4;-1;0;"unknown";"no"
-53;"retired";"married";"secondary";"no";303;"yes";"no";"unknown";6;"may";593;2;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";607;"yes";"no";"unknown";6;"may";99;2;-1;0;"unknown";"no"
-44;"blue-collar";"divorced";"secondary";"no";579;"yes";"no";"unknown";6;"may";198;2;-1;0;"unknown";"no"
-38;"admin.";"married";"secondary";"no";3047;"yes";"no";"unknown";6;"may";285;2;-1;0;"unknown";"no"
-54;"technician";"divorced";"secondary";"no";83;"yes";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-58;"management";"married";"tertiary";"no";68;"yes";"no";"unknown";6;"may";172;5;-1;0;"unknown";"no"
-52;"blue-collar";"married";"primary";"no";58;"yes";"no";"unknown";6;"may";213;3;-1;0;"unknown";"no"
-28;"admin.";"single";"secondary";"no";251;"yes";"no";"unknown";6;"may";178;2;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";688;"yes";"no";"unknown";6;"may";174;2;-1;0;"unknown";"no"
-60;"retired";"married";"primary";"no";364;"yes";"no";"unknown";6;"may";631;2;-1;0;"unknown";"no"
-42;"services";"divorced";"secondary";"no";55;"yes";"no";"unknown";6;"may";176;5;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";101;"yes";"no";"unknown";6;"may";32;3;-1;0;"unknown";"no"
-44;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";1529;2;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"primary";"no";325;"yes";"no";"unknown";6;"may";254;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";198;"yes";"no";"unknown";6;"may";200;2;-1;0;"unknown";"no"
-47;"entrepreneur";"married";"unknown";"no";209;"yes";"no";"unknown";6;"may";135;2;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";183;"yes";"no";"unknown";6;"may";112;4;-1;0;"unknown";"no"
-34;"management";"married";"tertiary";"no";105;"yes";"no";"unknown";6;"may";314;3;-1;0;"unknown";"no"
-35;"services";"married";"secondary";"no";109;"yes";"no";"unknown";6;"may";597;3;-1;0;"unknown";"no"
-35;"blue-collar";"single";"secondary";"no";376;"yes";"yes";"unknown";6;"may";207;3;-1;0;"unknown";"no"
-40;"blue-collar";"married";"primary";"no";-7;"yes";"no";"unknown";6;"may";410;2;-1;0;"unknown";"no"
-55;"technician";"married";"secondary";"no";0;"no";"no";"unknown";6;"may";160;3;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";143;"yes";"no";"unknown";6;"may";42;3;-1;0;"unknown";"no"
-35;"management";"single";"tertiary";"no";550;"yes";"no";"unknown";6;"may";55;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"primary";"no";162;"yes";"no";"unknown";6;"may";155;2;-1;0;"unknown";"no"
-53;"management";"married";"tertiary";"no";115;"yes";"no";"unknown";6;"may";336;3;-1;0;"unknown";"no"
-41;"blue-collar";"married";"primary";"no";512;"yes";"no";"unknown";6;"may";233;2;-1;0;"unknown";"no"
-57;"blue-collar";"married";"unknown";"no";807;"yes";"no";"unknown";6;"may";211;2;-1;0;"unknown";"no"
-45;"blue-collar";"married";"unknown";"no";248;"yes";"no";"unknown";6;"may";88;5;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";1211;"yes";"no";"unknown";6;"may";208;3;-1;0;"unknown";"no"
-56;"self-employed";"married";"unknown";"no";7;"no";"no";"unknown";6;"may";305;2;-1;0;"unknown";"no"
-31;"entrepreneur";"married";"tertiary";"no";281;"yes";"no";"unknown";6;"may";206;2;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";88;"yes";"no";"unknown";6;"may";128;2;-1;0;"unknown";"no"
-30;"management";"married";"tertiary";"no";32;"yes";"no";"unknown";6;"may";122;3;-1;0;"unknown";"no"
-30;"admin.";"single";"secondary";"no";115;"yes";"no";"unknown";6;"may";66;3;-1;0;"unknown";"no"
-54;"blue-collar";"married";"secondary";"no";254;"yes";"no";"unknown";6;"may";66;2;-1;0;"unknown";"no"
-36;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";6;"may";164;2;-1;0;"unknown";"no"
-55;"unemployed";"married";"tertiary";"no";383;"no";"no";"unknown";6;"may";343;3;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";569;"yes";"yes";"unknown";6;"may";126;2;-1;0;"unknown";"no"
-38;"housemaid";"married";"secondary";"no";0;"yes";"no";"unknown";6;"may";59;3;-1;0;"unknown";"no"
-48;"admin.";"married";"secondary";"no";3754;"yes";"no";"unknown";6;"may";249;3;-1;0;"unknown";"no"
-55;"housemaid";"divorced";"tertiary";"no";6920;"yes";"no";"unknown";6;"may";406;3;-1;0;"unknown";"no"
-59;"services";"married";"secondary";"no";307;"yes";"yes";"unknown";6;"may";250;7;-1;0;"unknown";"no"
-37;"technician";"married";"secondary";"no";-421;"yes";"no";"unknown";6;"may";183;5;-1;0;"unknown";"no"
-33;"blue-collar";"divorced";"secondary";"no";60;"no";"no";"unknown";6;"may";190;3;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";67;"yes";"no";"unknown";6;"may";220;2;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";402;"yes";"no";"unknown";6;"may";153;3;-1;0;"unknown";"no"
-30;"self-employed";"single";"tertiary";"no";800;"no";"no";"unknown";6;"may";95;2;-1;0;"unknown";"no"
-42;"technician";"married";"tertiary";"no";239;"yes";"yes";"unknown";6;"may";191;3;-1;0;"unknown";"no"
-51;"blue-collar";"divorced";"secondary";"no";421;"yes";"no";"unknown";6;"may";216;2;-1;0;"unknown";"no"
-44;"admin.";"divorced";"secondary";"no";161;"yes";"no";"unknown";7;"may";89;2;-1;0;"unknown";"no"
-46;"technician";"married";"secondary";"yes";289;"no";"no";"unknown";7;"may";51;3;-1;0;"unknown";"no"
-29;"student";"single";"secondary";"no";110;"yes";"no";"unknown";7;"may";169;3;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";245;"yes";"no";"unknown";7;"may";148;3;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";132;3;-1;0;"unknown";"no"
-50;"blue-collar";"married";"primary";"no";156;"yes";"no";"unknown";7;"may";117;3;-1;0;"unknown";"no"
-42;"technician";"single";"secondary";"no";0;"yes";"no";"unknown";7;"may";275;4;-1;0;"unknown";"no"
-39;"admin.";"married";"secondary";"no";20;"yes";"no";"unknown";7;"may";124;2;-1;0;"unknown";"no"
-55;"technician";"single";"tertiary";"no";92;"yes";"no";"unknown";7;"may";118;3;-1;0;"unknown";"no"
-46;"services";"married";"secondary";"no";89;"yes";"no";"unknown";7;"may";479;2;-1;0;"unknown";"no"
-42;"blue-collar";"married";"secondary";"no";166;"yes";"no";"unknown";7;"may";285;3;-1;0;"unknown";"no"
-45;"management";"married";"tertiary";"no";103;"yes";"no";"unknown";7;"may";35;4;-1;0;"unknown";"no"
-43;"blue-collar";"married";"primary";"no";-454;"yes";"no";"unknown";7;"may";322;2;-1;0;"unknown";"no"
-42;"admin.";"married";"secondary";"no";445;"yes";"no";"unknown";7;"may";202;2;-1;0;"unknown";"no"
-30;"admin.";"married";"secondary";"no";4;"no";"no";"unknown";7;"may";172;8;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";1001;"yes";"no";"unknown";7;"may";201;4;-1;0;"unknown";"no"
-51;"services";"divorced";"secondary";"no";-69;"yes";"no";"unknown";7;"may";216;3;-1;0;"unknown";"no"
-38;"technician";"single";"secondary";"no";42;"yes";"no";"unknown";7;"may";195;2;-1;0;"unknown";"no"
-57;"technician";"married";"unknown";"no";1617;"yes";"no";"unknown";7;"may";96;2;-1;0;"unknown";"no"
-42;"management";"divorced";"tertiary";"no";221;"yes";"no";"unknown";7;"may";720;2;-1;0;"unknown";"no"
-32;"technician";"divorced";"secondary";"no";210;"yes";"yes";"unknown";7;"may";188;2;-1;0;"unknown";"no"
-46;"management";"married";"tertiary";"no";0;"no";"no";"unknown";7;"may";70;2;-1;0;"unknown";"no"
-29;"student";"single";"tertiary";"no";185;"yes";"no";"unknown";7;"may";141;3;-1;0;"unknown";"no"
-59;"retired";"married";"secondary";"no";836;"yes";"no";"unknown";7;"may";106;1;-1;0;"unknown";"no"
-32;"blue-collar";"single";"secondary";"no";301;"yes";"no";"unknown";7;"may";395;2;-1;0;"unknown";"no"
-44;"blue-collar";"married";"primary";"no";503;"yes";"no";"unknown";7;"may";629;2;-1;0;"unknown";"no"
-40;"retired";"married";"primary";"no";407;"yes";"no";"unknown";7;"may";502;1;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";53;"yes";"no";"unknown";7;"may";446;1;-1;0;"unknown";"no"
-46;"self-employed";"married";"tertiary";"no";2303;"yes";"no";"unknown";7;"may";241;1;-1;0;"unknown";"no"
-43;"management";"married";"tertiary";"no";144;"yes";"no";"unknown";7;"may";131;3;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";205;"yes";"no";"unknown";7;"may";312;1;-1;0;"unknown";"no"
-39;"management";"married";"tertiary";"no";305;"yes";"no";"unknown";7;"may";275;6;-1;0;"unknown";"no"
-30;"blue-collar";"divorced";"secondary";"no";251;"yes";"yes";"unknown";7;"may";120;2;-1;0;"unknown";"no"
-56;"retired";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";333;4;-1;0;"unknown";"no"
-29;"technician";"married";"secondary";"no";8;"no";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-40;"blue-collar";"divorced";"secondary";"no";139;"yes";"no";"unknown";7;"may";91;1;-1;0;"unknown";"no"
-36;"services";"married";"secondary";"no";184;"yes";"no";"unknown";7;"may";128;3;-1;0;"unknown";"no"
-37;"blue-collar";"single";"secondary";"no";238;"yes";"no";"unknown";7;"may";200;2;-1;0;"unknown";"no"
-35;"admin.";"married";"secondary";"no";0;"no";"no";"unknown";7;"may";326;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"primary";"yes";0;"yes";"no";"unknown";7;"may";292;1;-1;0;"unknown";"no"
-47;"services";"married";"primary";"no";222;"yes";"no";"unknown";7;"may";68;1;-1;0;"unknown";"no"
-31;"services";"married";"secondary";"no";414;"yes";"no";"unknown";7;"may";215;1;-1;0;"unknown";"no"
-56;"retired";"single";"primary";"no";223;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-57;"technician";"married";"secondary";"no";197;"no";"no";"unknown";7;"may";32;1;-1;0;"unknown";"no"
-36;"blue-collar";"married";"secondary";"no";-251;"yes";"no";"unknown";7;"may";162;1;-1;0;"unknown";"no"
-45;"self-employed";"divorced";"secondary";"no";-139;"yes";"no";"unknown";7;"may";152;3;-1;0;"unknown";"no"
-47;"blue-collar";"married";"unknown";"no";733;"yes";"no";"unknown";7;"may";268;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";0;"yes";"no";"unknown";7;"may";104;2;-1;0;"unknown";"no"
-57;"services";"married";"secondary";"no";1;"no";"no";"unknown";7;"may";852;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"primary";"no";97;"yes";"no";"unknown";7;"may";923;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"primary";"no";435;"yes";"no";"unknown";7;"may";159;2;-1;0;"unknown";"no"
-31;"management";"divorced";"tertiary";"no";0;"yes";"no";"unknown";7;"may";953;3;-1;0;"unknown";"no"
-37;"technician";"single";"tertiary";"no";147;"no";"no";"unknown";7;"may";416;2;-1;0;"unknown";"no"
-30;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";174;1;-1;0;"unknown";"no"
-58;"services";"divorced";"secondary";"no";1109;"yes";"yes";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-33;"services";"married";"secondary";"no";404;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"primary";"no";981;"yes";"no";"unknown";7;"may";294;1;-1;0;"unknown";"no"
-33;"blue-collar";"single";"primary";"no";95;"yes";"no";"unknown";7;"may";102;1;-1;0;"unknown";"no"
-34;"services";"married";"secondary";"no";302;"yes";"no";"unknown";7;"may";124;1;-1;0;"unknown";"no"
-36;"services";"divorced";"secondary";"no";-290;"yes";"yes";"unknown";7;"may";128;1;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";259;"yes";"no";"unknown";7;"may";130;1;-1;0;"unknown";"no"
-35;"blue-collar";"married";"secondary";"no";527;"yes";"yes";"unknown";7;"may";143;1;-1;0;"unknown";"no"
-55;"retired";"married";"secondary";"no";102;"yes";"no";"unknown";7;"may";74;1;-1;0;"unknown";"no"
-34;"management";"single";"tertiary";"no";872;"yes";"no";"unknown";7;"may";105;2;-1;0;"unknown";"no"
-40;"management";"divorced";"tertiary";"no";490;"yes";"no";"unknown";7;"may";477;2;-1;0;"unknown";"no"
-42;"blue-collar";"single";"primary";"no";19;"yes";"no";"unknown";7;"may";158;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"secondary";"no";16;"yes";"no";"unknown";7;"may";250;1;-1;0;"unknown";"no"
-42;"management";"married";"tertiary";"no";386;"yes";"no";"unknown";7;"may";168;1;-1;0;"unknown";"no"
-35;"technician";"single";"secondary";"no";539;"yes";"no";"unknown";7;"may";520;1;-1;0;"unknown";"no"
-44;"technician";"divorced";"secondary";"no";-329;"yes";"no";"unknown";7;"may";171;1;-1;0;"unknown";"no"
-30;"services";"single";"secondary";"no";-174;"yes";"no";"unknown";7;"may";113;1;-1;0;"unknown";"no"
-45;"entrepreneur";"married";"secondary";"no";68;"yes";"no";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"unknown";"yes";-532;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-36;"admin.";"divorced";"secondary";"no";0;"yes";"no";"unknown";7;"may";133;2;-1;0;"unknown";"no"
-49;"blue-collar";"married";"secondary";"no";64;"yes";"no";"unknown";7;"may";293;3;-1;0;"unknown";"no"
-31;"blue-collar";"single";"secondary";"no";1415;"yes";"no";"unknown";7;"may";485;1;-1;0;"unknown";"no"
-31;"technician";"single";"secondary";"no";147;"yes";"no";"unknown";7;"may";374;1;-1;0;"unknown";"no"
-39;"blue-collar";"married";"secondary";"no";72;"yes";"no";"unknown";7;"may";425;6;-1;0;"unknown";"no"
-37;"services";"single";"secondary";"no";-196;"yes";"no";"unknown";7;"may";207;1;-1;0;"unknown";"no"
-33;"blue-collar";"married";"primary";"no";716;"yes";"no";"unknown";7;"may";83;3;-1;0;"unknown";"no"
-37;"management";"married";"tertiary";"no";0;"yes";"no";"unknown";7;"may";228;1;-1;0;"unknown";"no"
-42;"services";"married";"secondary";"no";-246;"yes";"no";"unknown";7;"may";149;1;-1;0;"unknown";"no"
-56;"blue-collar";"married";"secondary";"no";-203;"yes";"no";"unknown";7;"may";139;1;-1;0;"unknown";"no"
-37;"admin.";"single";"secondary";"no";245;"yes";"yes";"unknown";7;"may";732;2;-1;0;"unknown";"yes"
-36;"services";"single";"secondary";"no";342;"yes";"no";"unknown";7;"may";142;1;-1;0;"unknown";"no"
-29;"technician";"single";"tertiary";"no";3;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-54;"management";"married";"tertiary";"yes";-248;"yes";"yes";"unknown";7;"may";112;1;-1;0;"unknown";"no"
-38;"blue-collar";"married";"secondary";"no";376;"yes";"no";"unknown";7;"may";1521;1;-1;0;"unknown";"no"
-43;"blue-collar";"divorced";"secondary";"no";370;"yes";"no";"unknown";7;"may";216;1;-1;0;"unknown";"no"
-47;"admin.";"single";"secondary";"no";594;"yes";"no";"unknown";7;"may";161;1;-1;0;"unknown";"no"
-47;"blue-collar";"married";"secondary";"no";387;"yes";"no";"unknown";7;"may";122;2;-1;0;"unknown";"no"
-38;"services";"married";"secondary";"no";208;"yes";"no";"unknown";7;"may";800;1;-1;0;"unknown";"no"
-40;"blue-collar";"married";"secondary";"no";563;"yes";"no";"unknown";7;"may";615;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";392;"yes";"yes";"unknown";7;"may";254;1;-1;0;"unknown";"no"
-33;"retired";"married";"secondary";"no";165;"no";"no";"unknown";7;"may";111;1;-1;0;"unknown";"no"
-53;"admin.";"divorced";"unknown";"no";236;"yes";"no";"unknown";7;"may";354;1;-1;0;"unknown";"no"
-37;"services";"married";"primary";"no";52;"yes";"no";"unknown";7;"may";359;1;-1;0;"unknown";"no"
-40;"management";"single";"tertiary";"no";1265;"yes";"no";"unknown";7;"may";97;1;-1;0;"unknown";"no"
-37;"blue-collar";"married";"primary";"no";693;"yes";"no";"unknown";7;"may";327;3;-1;0;"unknown";"no"
-35;"technician";"married";"secondary";"no";118;"yes";"no";"unknown";7;"may";236;1;-1;0;"unknown";"no"
-49;"blue-collar";"married";"primary";"no";3659;"yes";"no";"unknown";7;"may";160;1;-1;0;"unknown";"no"
-26;"blue-collar";"single";"secondary";"no";24;"yes";"no";"unknown";7;"may";180;1;-1;0;"unknown";"no"
-38;"management";"single";"tertiary";"no";673;"yes";"no";"unknown";7;"may";184;1;-1;0;"unknown";"no"
-52;"self-employed";"married";"secondary";"no";273;"no";"no";"unknown";7;"may";227;1;-1;0;"unknown";"no"
-33;"services";"divorced";"secondary";"no";327;"yes";"no";"unknown";7;"may";109;1;-1;0;"unknown";"no"
-31;"admin.";"single";"secondary";"no";299;"yes";"no";"unknown";7;"may";492;2;-1;0;"unknown";"no"
-32;"blue-collar";"married";"secondary";"no";0;"yes";"no";"unknown";7;"may";298;1;-1;0;"unknown";"no"
-35;"blue-collar";"single";"primary";"no";109;"yes";"no";"unknown";7;"may";83;2;-1;0;"unknown";"no"
-55;"management";"divorced";"tertiary";"no";552;"no";"no";"unknown";7;"may";241;2;-1;0;"unknown";"no"
-32;"blue-collar";"divorced";"primary";"no";473;"yes";"no";"unknown";7;"may";204;2;-1;0;"unknown";"no"
-37;"unknown";"single";"unknown";"no";414;"yes";"no";"unknown";7;"may";131;1;-1;0;"unknown";"no"
-45;"blue-collar";"married";"secondary";"no";154

<TRUNCATED>
r***@apache.org
2018-06-28 14:55:16 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country.txt b/community/mahout-mr/examples/bin/resources/country.txt
deleted file mode 100644
index 6a22091..0000000
--- a/community/mahout-mr/examples/bin/resources/country.txt
+++ /dev/null
@@ -1,229 +0,0 @@
-Afghanistan
-Albania
-Algeria
-American Samoa
-Andorra
-Angola
-Anguilla
-Antigua and Barbuda
-Argentina
-Armenia
-Aruba
-Australia
-Austria
-Azerbaijan
-Bahamas
-Bangladesh
-Barbados
-Belarus
-Belgium
-Belize
-Benin
-Bermuda
-Bhutan
-Bolivia
-Bosnia and Herzegovina
-Botswana
-Bouvet Island
-Brazil
-British Indian Ocean Territory
-Brunei Darussalam
-Bulgaria
-Burkina Faso
-Burundi
-Cambodia
-Cameroon
-Canada
-Cape Verde
-Cayman Islands
-Central African Republic
-Chad
-Chile
-China
-Christmas Island
-Cocos Islands
-Colombia
-Comoros
-Congo
-Cook Islands
-Costa Rica
-Croatia
-Côte d'Ivoire
-Cuba
-Cyprus
-Czech Republic
-Djibouti
-Dominica
-Dominican Republic
-Ecuador
-Egypt
-El Salvador
-Equatorial Guinea
-Eritrea
-Estonia
-Ethiopia
-Falkland Islands
-Faroe Islands
-Fiji
-Finland
-France
-French Guiana
-French Polynesia
-French Southern Territories
-Gabon
-Georgia
-Germany
-Ghana
-Gibraltar
-Greece
-Greenland
-Grenada
-Guadeloupe
-Guam
-Guatemala
-Guernsey
-Guinea
-Guinea-Bissau
-Guyana
-Haiti
-Honduras
-Hong Kong
-Hungary
-Iceland
-India
-Indonesia
-Iran
-Iraq
-Ireland
-Isle of Man
-Israel
-Italy
-Japan
-Jersey
-Jordan
-Kazakhstan
-Kenya
-Kiribati
-Korea
-Kuwait
-Kyrgyzstan
-Latvia
-Lebanon
-Lesotho
-Liberia
-Liechtenstein
-Lithuania
-Luxembourg
-Macedonia
-Madagascar
-Malawi
-Malaysia
-Maldives
-Mali
-Malta
-Marshall Islands
-Martinique
-Mauritania
-Mauritius
-Mayotte
-Mexico
-Micronesia
-Moldova
-Monaco
-Mongolia
-Montenegro
-Montserrat
-Morocco
-Mozambique
-Myanmar
-Namibia
-Nauru
-Nepal
-Netherlands
-Netherlands Antilles
-New Caledonia
-New Zealand
-Nicaragua
-Niger
-Nigeria
-Niue
-Norfolk Island
-Northern Mariana Islands
-Norway
-Oman
-Pakistan
-Palau
-Palestinian Territory
-Panama
-Papua New Guinea
-Paraguay
-Peru
-Philippines
-Pitcairn
-Poland
-Portugal
-Puerto Rico
-Qatar
-Réunion
-Russian Federation
-Rwanda
-Saint Barthélemy
-Saint Helena
-Saint Kitts and Nevis
-Saint Lucia
-Saint Martin
-Saint Pierre and Miquelon
-Saint Vincent and the Grenadines
-Samoa
-San Marino
-Sao Tome and Principe
-Saudi Arabia
-Senegal
-Serbia
-Seychelles
-Sierra Leone
-Singapore
-Slovakia
-Slovenia
-Solomon Islands
-Somalia
-South Africa
-South Georgia and the South Sandwich Islands
-Spain
-Sri Lanka
-Sudan
-Suriname
-Svalbard and Jan Mayen
-Swaziland
-Sweden
-Switzerland
-Syrian Arab Republic
-Taiwan
-Tanzania
-Thailand
-Timor-Leste
-Togo
-Tokelau
-Tonga
-Trinidad and Tobago
-Tunisia
-Turkey
-Turkmenistan
-Turks and Caicos Islands
-Tuvalu
-Ukraine
-United Arab Emirates
-United Kingdom
-United States
-United States Minor Outlying Islands
-Uruguay
-Uzbekistan
-Vanuatu
-Vatican
-Venezuela
-Vietnam
-Virgin Islands
-Wallis and Futuna
-Yemen
-Zambia
-Zimbabwe

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country10.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country10.txt b/community/mahout-mr/examples/bin/resources/country10.txt
deleted file mode 100644
index 97a63e1..0000000
--- a/community/mahout-mr/examples/bin/resources/country10.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Australia
-Austria
-Bahamas
-Canada
-Colombia
-Cuba
-Panama
-Pakistan
-United Kingdom
-Vietnam

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/country2.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/country2.txt b/community/mahout-mr/examples/bin/resources/country2.txt
deleted file mode 100644
index f4b4f61..0000000
--- a/community/mahout-mr/examples/bin/resources/country2.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-United States
-United Kingdom

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/donut-test.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/donut-test.csv b/community/mahout-mr/examples/bin/resources/donut-test.csv
deleted file mode 100644
index 46ea564..0000000
--- a/community/mahout-mr/examples/bin/resources/donut-test.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","xx","xy","yy","c","a","b"
-0.802415437065065,0.0978854028508067,21,2,0.643870533640319,0.07854475831082,0.00958155209126472,0.503141377562721,0.808363832523192,0.220502180491382
-0.97073650965467,0.989339149091393,23,2,0.942329371176533,0.96038763245370,0.978791951924881,0.67900343471543,1.38604520961670,0.989771844311643
-0.566630310611799,0.369259539060295,25,1,0.321069908904024,0.209233647314105,0.136352607187021,0.146740132271139,0.676330182744379,0.569352171215186
-0.377948862500489,0.500907538458705,24,1,0.142845342665413,0.189317434378387,0.250908362084759,0.122054511555201,0.62749797190921,0.79865886318828
-0.0133881184738129,0.269793515326455,25,2,0.000179241716268851,0.00361202754665705,0.0727885409122062,0.538317888266967,0.270125494221621,1.02283505301727
-0.395229484187439,0.385281964903697,25,1,0.156206345171069,0.152274792255611,0.148442192480054,0.155361155247979,0.551949760078871,0.717070128562224
-0.757145672803745,0.416044564917684,21,1,0.573269569845435,0.315006342020941,0.173093079997545,0.270503996498299,0.863922826323613,0.481737796145881
-0.589166145538911,0.971624446567148,24,2,0.347116747049177,0.572448230095344,0.944054065166917,0.479979395505718,1.13629697360157,1.05491161769044
-0.843438957352191,0.218833807157353,25,2,0.711389274779351,0.184572958142208,0.0478882351549814,0.443852166182378,0.871365313708512,0.269071728782402
-0.628562391968444,0.801476288354024,25,2,0.395090680597092,0.503777852913796,0.642364240793743,0.327744170151609,1.01855531091386,0.8833629703887
-0.262267543468624,0.247060472844169,22,2,0.0687842643570668,0.0647959433010369,0.0610388772419841,0.347124077652729,0.360309785599907,0.778002605819416
-0.738417695043609,0.562460686312988,21,1,0.545260692353516,0.415330923539883,0.316362023647678,0.246463657857698,0.928236347058869,0.620312280963368
-0.498857178725302,0.164454092038795,21,1,0.248858484765768,0.0820391043843046,0.0270451483883046,0.335547854098302,0.525265297877247,0.527436513434051
-0.499293045606464,0.733599063009024,25,1,0.249293545390979,0.366280910423824,0.538167585247717,0.233600132755117,0.88739006679064,0.888186376514393
-0.553942533675581,0.548312899889424,24,1,0.306852330614922,0.303733837011753,0.30064703618515,0.0724150069741539,0.779422457207946,0.706833997094728
-0.661088703200221,0.98143746308051,24,2,0.43703827349895,0.64881721974001,0.963219493937908,0.507672730364875,1.1833248782295,1.03830648704340
-0.492181566543877,0.376017479225993,23,1,0.242242694445585,0.185068871973329,0.141389144683470,0.124228794404457,0.619380205632255,0.63187712891139
-0.991064163157716,0.216620326042175,21,2,0.982208175495505,0.21468464215194,0.0469243656546183,0.566963889458783,1.01446170018888,0.21680455446021
-0.601602173643187,0.343355831922963,24,1,0.361925175332207,0.206563614817919,0.117893227315510,0.186709392055052,0.692689254029335,0.52594111396747
-0.0397100185509771,0.0602901463862509,25,2,0.00157688557331895,0.00239412283143915,0.00363490175127556,0.636562347604197,0.0721927096360464,0.962180726382856
-0.158290433697402,0.630195834673941,23,2,0.0250558614001118,0.0997539719848347,0.397146790040385,0.365672507948237,0.649771230080632,1.05148551299849
-0.967184047214687,0.497705311980098,25,2,0.935444981186582,0.48137263796116,0.247710577573207,0.467189682639721,1.08772954302059,0.498785990511377
-0.538070349488407,0.0130743277259171,24,2,0.289519700998577,0.00703490808881019,0.000170938045484685,0.488411672495383,0.538229169633216,0.462114639529248
-0.758642012253404,0.673675778554752,25,2,0.575537702755893,0.511078748249156,0.453839054611352,0.311542880770993,1.01458206044028,0.715606548922268
-0.986405614530668,0.981674374546856,21,2,0.972996036377624,0.9683291146939,0.96368457764196,0.684544100071034,1.39164672744903,0.981768498658543
-0.51937106740661,0.462004136526957,23,1,0.269746305659081,0.239951581534275,0.213447822168019,0.0426488439882434,0.695121664046734,0.666672328069706
-0.534244359936565,0.692785677267238,21,1,0.28541703612403,0.370116840724856,0.479951994626626,0.195803456422130,0.87485371963012,0.83479357381183
-0.0795328004751354,0.536029864801094,22,2,0.00632546635141770,0.0426319562859392,0.287328015958679,0.422008076977050,0.541898036820671,1.06517035321108
-0.330987347057089,0.804738595616072,23,2,0.10955262391189,0.266358292837412,0.647604207274128,0.348469350894533,0.870147591610767,1.04650950166343
-0.9804020607844,0.74571731640026,25,2,0.961188200790297,0.731102793761427,0.556094315979205,0.539595348001485,1.23178022259229,0.745974795285138
-0.362560331821442,0.805498170899227,21,2,0.131449994210474,0.292041684122788,0.648827303322001,0.334990738397057,0.883333061496328,1.02720817456326
-0.47635925677605,0.961423690896481,21,2,0.226918141516230,0.457983074842334,0.924335513417013,0.462028903057712,1.07296488988841,1.09477629741475
-0.850710266502574,0.635807712096721,24,2,0.723707957532881,0.540888148202193,0.404251446761667,0.376086992190972,1.06205433208219,0.65309943445803
-0.136131341336295,0.714137809583917,25,2,0.0185317420940189,0.0972165379176223,0.509992811077315,0.422203034393551,0.726996941651981,1.12083088398685
-0.930458213202655,0.865616530412808,24,2,0.865752486516278,0.805420010206583,0.749291977723908,0.564774043865972,1.27084399681479,0.868405457050378
-0.374636142514646,0.197784703457728,21,2,0.140352239278254,0.0740972983518064,0.0391187889218614,0.327185241457712,0.423640210792266,0.655895375171089
-0.482126326300204,0.841961156809703,22,1,0.232445794511731,0.405931639420132,0.708898589576332,0.342427950053959,0.970229036922758,0.988479504839456
-0.660344187868759,0.746531683253124,24,2,0.436054446452051,0.492967858096082,0.557309554100743,0.294088642131774,0.996676477375078,0.82016804669243
-0.0772640188224614,0.437956433976069,22,2,0.00596972860459766,0.0338382741581451,0.191805838061035,0.427264688298837,0.444719649515999,1.02139489377063
-0.998469967395067,0.464829172473401,25,2,0.996942275789907,0.464117968683793,0.216066159582307,0.499709210945471,1.10136662168971,0.464831690595724

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/donut.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/donut.csv b/community/mahout-mr/examples/bin/resources/donut.csv
deleted file mode 100644
index 33ba3b7..0000000
--- a/community/mahout-mr/examples/bin/resources/donut.csv
+++ /dev/null
@@ -1,41 +0,0 @@
-"x","y","shape","color","k","k0","xx","xy","yy","a","b","c","bias"
-0.923307513352484,0.0135197141207755,21,2,4,8,0.852496764213146,0.0124828536260896,0.000182782669907495,0.923406490600458,0.0778750292332978,0.644866125183976,1
-0.711011884035543,0.909141522599384,22,2,3,9,0.505537899239772,0.64641042683833,0.826538308114327,1.15415605849213,0.953966686673604,0.46035073663368,1
-0.75118898646906,0.836567111080512,23,2,3,9,0.564284893392414,0.62842000028592,0.699844531341594,1.12433510339845,0.872783737128441,0.419968245447719,1
-0.308209649519995,0.418023289414123,24,1,5,1,0.094993188057238,0.128838811521522,0.174743470492603,0.519361780024138,0.808280495564412,0.208575453051705,1
-0.849057961953804,0.500220163026825,25,1,5,2,0.720899422757147,0.424715912147755,0.250220211498583,0.985454024425153,0.52249756970547,0.349058031386046,1
-0.0738831346388906,0.486534863477573,21,2,6,1,0.00545871758406844,0.0359467208248278,0.236716173379140,0.492112681164801,1.04613986717142,0.42632955896436,1
-0.612888508243486,0.0204555552918464,22,2,4,10,0.375632323536926,0.0125369747681119,0.000418429742297785,0.613229772009826,0.387651566219268,0.492652707029903,1
-0.207169560948387,0.932857288978994,23,2,1,4,0.0429192269835473,0.193259634985281,0.870222721601238,0.955584610897845,1.22425602987611,0.522604151014326,1
-0.309267645236105,0.506309477845207,24,1,5,1,0.0956464763898851,0.156585139973909,0.256349287355886,0.593292308854389,0.856423069092351,0.190836685845410,1
-0.78758287569508,0.171928803203627,25,2,4,10,0.620286786088131,0.135408181241926,0.0295595133710317,0.806130448165285,0.273277419610556,0.436273561610666,1
-0.930236018029973,0.0790199618786573,21,2,4,8,0.86533904924026,0.0735072146828825,0.00624415437530446,0.93358620577618,0.105409523078414,0.601936228937031,1
-0.238834470743313,0.623727766098455,22,1,5,1,0.0570419044152386,0.148967690904034,0.389036326202168,0.667890882268509,0.984077887735915,0.288991338582386,1
-0.83537525916472,0.802311758277938,23,2,3,7,0.697851823624524,0.670231393002335,0.643704157471036,1.15825557675997,0.819027144096042,0.451518508649315,1
-0.656760312616825,0.320640653371811,24,1,5,3,0.43133410822855,0.210584055746134,0.102810428594702,0.730851925374252,0.469706197095164,0.238209090579297,1
-0.180789119331166,0.114329558331519,25,2,2,5,0.0326847056685386,0.0206695401642766,0.0130712479082803,0.213906413126907,0.82715035810576,0.500636870310341,1
-0.990028728265315,0.061085847672075,21,2,4,8,0.980156882790638,0.0604767440857932,0.00373148078581595,0.991911469626425,0.06189432159595,0.657855445853466,1
-0.751934139290825,0.972332585137337,22,2,3,9,0.565404949831033,0.731130065509666,0.945430656119858,1.22916052895905,1.00347761677540,0.535321288127727,1
-0.136412925552577,0.552212274167687,23,2,6,1,0.0186084862578129,0.0753288918452558,0.304938395741448,0.5688118159807,1.02504684326820,0.3673168690368,1
-0.5729476721026,0.0981996888294816,24,2,4,10,0.328269034967789,0.0562632831160512,0.0096431788862070,0.581302170866406,0.43819729534628,0.408368525870829,1
-0.446335297077894,0.339370004367083,25,1,5,3,0.199215197417612,0.151472811718508,0.115171999864114,0.560702414192882,0.649397107420365,0.169357302283512,1
-0.922843366628513,0.912627586396411,21,2,3,7,0.851639879330248,0.842212314308118,0.832889111451739,1.29789405992245,0.915883320912091,0.590811338548155,1
-0.166969822719693,0.398156099021435,22,2,6,1,0.0278789216990458,0.0664800532683736,0.158528279187967,0.431749002184154,0.923291695753637,0.348254618269284,1
-0.350683249300346,0.84422400011681,23,2,1,6,0.122978741339848,0.296055215498298,0.712714162373228,0.914162405545687,1.06504760696993,0.375214144584023,1
-0.47748578293249,0.792779305484146,24,1,5,6,0.227992672902653,0.378540847371773,0.628499027203925,0.9254683679665,0.949484141121692,0.29364368150863,1
-0.384564548265189,0.153326370986179,25,2,2,5,0.147889891782409,0.0589638865954405,0.0235089760397912,0.414003463538894,0.634247405427742,0.365387395199715,1
-0.563622857443988,0.467359990812838,21,1,5,3,0.317670725433326,0.263414773476928,0.218425361012576,0.73218582781006,0.639414084578942,0.071506910079209,1
-0.343304847599939,0.854578266385943,22,2,1,6,0.117858218385617,0.293380861503846,0.730304013379203,0.920957236664559,1.07775346743350,0.387658506651072,1
-0.666085948701948,0.710089378990233,23,1,5,2,0.443670491058174,0.472980557667886,0.504226926154735,0.973600234805286,0.784681795257806,0.267809801016930,1
-0.190568120684475,0.0772022884339094,24,2,2,5,0.0363162086212125,0.0147122950193909,0.00596019333943254,0.205612261211838,0.813105258002736,0.523933195018469,1
-0.353534662164748,0.427994541125372,25,1,5,1,0.124986757351942,0.151310905505115,0.183179327233118,0.555127088678854,0.775304301713569,0.163208092002022,1
-0.127048352966085,0.927507144864649,21,2,1,4,0.0161412839913949,0.117838255119330,0.860269503774972,0.936168140755905,1.27370093893119,0.567322915045421,1
-0.960906301159412,0.891004979610443,22,2,3,7,0.923340919607862,0.856172299272088,0.793889873690606,1.31043152942016,0.891862204031343,0.604416671286136,1
-0.306814440060407,0.902291874401271,23,2,1,6,0.094135100629581,0.276836176215481,0.81413062661056,0.953029761990747,1.13782109627099,0.446272800849954,1
-0.087350245565176,0.671402548439801,24,2,6,4,0.00763006540029655,0.0586471774793016,0.450781382051459,0.677060889028273,1.13300968942079,0.446831795474291,1
-0.27015240653418,0.371201378758997,25,1,5,1,0.0729823227562089,0.100280945780549,0.137790463592580,0.459099974241765,0.81882108746687,0.263474858488646,1
-0.871842501685023,0.569787061074749,21,2,3,2,0.7601093477444,0.496764576755166,0.324657294968199,1.04152131169391,0.584021951079369,0.378334613738721,1
-0.686449621338397,0.169308491749689,22,2,4,10,0.471213082635629,0.116221750050949,0.0286653653785545,0.707020825728764,0.356341416814533,0.379631841296403,1
-0.67132937326096,0.571220482233912,23,1,5,2,0.450683127402953,0.383477088331915,0.326292839323543,0.881462402332905,0.659027480614106,0.185542747720368,1
-0.548616112209857,0.405350996181369,24,1,5,3,0.300979638576258,0.222382087605415,0.164309430105228,0.682121007359754,0.606676886210257,0.106404700508298,1
-0.677980388281867,0.993355110753328,25,2,3,9,0.459657406894831,0.673475283690318,0.986754376059756,1.20266860895036,1.04424662144096,0.524477152905055,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/resources/test-data.csv
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/resources/test-data.csv b/community/mahout-mr/examples/bin/resources/test-data.csv
deleted file mode 100644
index ab683cd..0000000
--- a/community/mahout-mr/examples/bin/resources/test-data.csv
+++ /dev/null
@@ -1,61 +0,0 @@
-"V1","V2","V3","V4","V5","V6","V7","V8","y"
-1,-0.212887381184450,-0.955959589855826,-0.00326541907490505,0.0560086232868742,0.091264583618544,0.0172194710825328,-0.0237399208336878,1
-1,3.14702017427074,2.12881054220556,-0.00566925018709358,-0.055626039510634,-0.0630510476335515,-0.00155145331201058,0.108559859662683,0
-1,-2.16541417186635,-2.71847685293678,-0.00833554984263851,0.0433655514274994,-0.102555485096075,-0.156155728366877,-0.0241458595902909,1
-1,-4.33686585982661,-2.6857484867589,-0.0115524101901378,0.122387581992154,0.081766215557828,-0.0206167352421607,-0.0424490760296281,1
-1,2.34100936064648,2.10958510331364,-0.0129315842415535,0.173866353524092,-0.0299915285951044,0.108136400830407,-0.0063355720943443,0
-1,1.30317270786224,3.37038662087804,-0.0230504278644102,-0.131884713919903,0.086455020204179,0.17337860146005,-0.0524355492943794,0
-1,1.94943481762617,3.54806480367192,-0.029538920288902,-0.0720379027720258,0.214306548234308,-0.082665692089578,0.226607475768828,0
-1,3.14635496849369,1.76134258264267,-0.0318247859223975,-0.187198080297378,-0.08576487890296,0.153638925055934,-0.0691201521844938,0
-1,-1.26105438936697,-1.95583819596755,-0.0367826492102569,-0.0936093811581598,-0.0317225362744449,-0.0840334569992295,-0.0627566339884115,1
-1,2.40442001058194,3.23077413487565,-0.0452264569747572,0.0371989606630366,-0.17352653795031,0.102543062447842,-0.0551882772900301,0
-1,-2.20940227045733,-0.175769402031962,-0.0465958462590872,0.130789407148096,-0.140283147466875,0.0708851428212228,0.0605244763586474,1
-1,-1.64710385829030,-2.57691366099069,-0.0553070134425288,-0.0349011715152424,-0.0826092377112715,0.106766133325393,-0.0585587032435851,1
-1,-2.6523724984616,-4.16903830585265,-0.0568310036349303,-0.0291979248790545,-0.255996825268056,0.0401827924643623,0.0179311252387879,1
-1,2.34337447158977,0.28996735916551,-0.0625800583342644,0.0899232083837452,0.0255207970332586,-0.0343458209061299,0.0755898049986344,0
-1,3.67556867120403,1.36097809464341,-0.0956707962851342,0.0537771695881714,-0.0373171704803031,0.0463473815328367,-0.228499359561800,0
-1,1.96533061882493,2.92646586187099,-0.103334098736041,-0.0194013528907574,0.0253359438067293,0.00748464018133427,-0.239745502177878,0
-1,-1.95041601303593,-0.860607985906108,-0.103721968898869,-0.00972933741506002,0.0227857854969761,-0.0287381002832544,-0.130156656165122,1
-1,-1.51543545229533,-1.35683836829949,-0.106483722717291,0.103877046729912,0.00840497101030744,0.0258430051020969,0.168907472637671,1
-1,1.45074382041585,1.88231080047069,-0.107681637419817,-0.00626324733854461,-0.144385489192821,0.00088239451623517,-0.00299885969569744,0
-1,3.87956616310254,4.31276421460554,-0.129963535661731,-0.0640782960295875,-0.0324909886960640,0.0428280701443882,0.0329254937199428,0
-1,-2.88187391546093,-3.16731558128991,-0.136390769151814,-0.155408895734766,0.105626409419800,-0.0918345772196075,0.197828194781600,1
-1,-2.65024496288248,-1.81147577507541,-0.145438998990911,0.0691687502404964,0.0749439097959056,-0.0674149410216342,0.123896965825847,1
-1,-1.37426198993006,-2.08894064826135,-0.153236566384176,0.0213513951854753,-0.134553043562400,0.00287304090325258,0.0122158739075685,1
-1,1.65698424179346,2.49004336804714,-0.153862461770005,0.105220938080375,-0.0946233303225818,-0.122426312548592,-0.00538234276442917,0
-1,2.93315586503758,2.75229115279104,-0.168877592929163,-0.0349207806558679,0.0189964813847077,0.202397029441612,0.0426299706123943,0
-1,-3.84306960373604,-2.35606387141237,-0.179511886850707,-0.0916819865200809,0.0265829433229566,0.101658708455140,-0.0855390303406673,1
-1,2.28101644492271,1.37963780647481,-0.180898801743387,-0.0789829066843624,-0.0779025366072777,0.0442621459868237,-0.136195159617836,0
-1,1.70008372335953,2.71018350574622,-0.188985514267118,-0.195856534813112,-0.106263419324547,-0.0311178988395261,-0.121173036989233,0
-1,-2.05613043162767,-1.73770126734937,0.00630625444849072,-0.134595964087825,0.0708994966210059,0.0739139562742148,-0.00416084523004362,1
-1,2.39375626983328,3.2468518382106,0.00951905535238045,-0.140380515724865,0.0630970962358967,0.00183192220061040,-0.0773483294293499,0
-1,4.26863682432937,3.49421800345979,0.0109175198048448,-0.109995560295421,-0.111585866731122,0.154763193427948,-0.0186987535307691,0
-1,1.54495296452702,3.17243560853872,0.0117478311845783,0.115838636637105,-0.1715332868224,0.0927292648278796,-0.0885962242970987,0
-1,2.16883227993245,1.63879588167162,0.0158863105366749,-0.00488771308802354,0.0280782748001184,0.131946735985038,0.066416828384239,0
-1,1.86427271422921,3.32026821853873,0.0162473257475520,0.0355005599857545,-0.0988825269654524,0.0527023072810735,0.100841323212596,0
-1,-3.03828333997027,-1.43214405751321,0.0247204684728272,0.146197859364444,0.0141171187314724,-0.201738256450160,0.044002672456105,1
-1,2.08595761680696,0.225336429607513,0.0335964287149376,0.0576493862055925,0.121452048491972,0.0640240734436852,0.224720096669846,0
-1,-1.85256114614442,-2.22817393781734,0.0346230650580488,0.160185441442375,0.0114059982858295,0.00496408500928602,-0.094156048483371,1
-1,2.33572915427688,1.03334367238243,0.0357824515834720,-0.172284120406131,0.0329286256184980,-0.101030665525296,-0.00238851979619332,0
-1,-2.00334039609229,-2.98875026257892,0.0375804284421083,0.142856636546252,-0.0862220203147005,-0.0441603903572752,0.0147126239348866,1
-1,2.38346139581192,1.21051372282823,0.0405425233313353,-0.145245065311593,-0.0216697981922324,-0.0128934036902430,-0.0325085994141851,0
-1,-1.15629168023471,-1.37784639006639,0.0429948703549178,-0.00491267793152886,0.0263522850749959,-0.0442602193050815,0.0582704866256344,1
-1,2.13230915550664,1.32833684701498,0.0434112538719301,-0.0296522957829338,0.00247091583877657,-0.123872403365319,-0.136549696313901,0
-1,-1.88291252343724,-1.99980946454726,0.0472833199907535,-0.0365284873908706,-0.0209054390489622,-0.0891896486647233,0.0542966824787834,1
-1,-1.34787394136153,-2.57763619051754,0.0493154843443071,0.0384664637019124,-0.00780509859650452,-0.118550134827935,0.00573215142098708,1
-1,-1.81748193199251,-2.72113041015796,0.0551479875680516,-0.255723061179778,-0.217672946803948,0.145106553357089,0.0632886151091758,1
-1,-3.13049595715861,-0.0285946551309455,0.0724437318718333,-0.0360911974267016,-0.121364676014540,0.038351368519738,-0.0125375424386282,1
-1,-2.3836883021805,-1.40162632998805,0.0746620557343183,0.069222624188286,0.04657285528431,0.0932835769596473,0.00836816351062604,1
-1,-2.43800450243598,-0.965440038635416,0.0763675021411913,-0.122575769653323,0.045866930905471,-0.0493852614669876,0.128116802512532,1
-1,1.09024638837653,2.21814920469686,0.0769910502309598,-0.270152593833931,-0.252735856082821,0.0661674666715274,-0.000429289775969046,0
-1,3.17642151475607,1.18015379683312,0.0776648965451875,-0.117234850817615,0.0759455286430382,0.119280079276134,0.117056969569811,0
-1,-3.5501372839931,-4.02435741321994,0.0833451415432366,-0.0185864612285970,0.0553371588028254,0.0269699189958747,-0.0930023774668385,1
-1,-2.85922019599943,-2.07644295605507,0.0903467736346066,0.124804691516462,0.0673015037344841,0.0234043567104492,0.0866115903248345,1
-1,0.513249476607372,5.0165612245778,0.0934321220365115,-0.0387550539552360,0.070129320868753,0.0635055975927393,-0.00773489793089484,0
-1,1.30094323285406,2.74698316868320,0.094239413405751,-0.105600040230387,-0.0134676903839459,0.00834379403909127,0.0978349326557826,0
-1,1.62511731278249,3.01296963021698,0.104352029985773,-0.0065839083200722,0.068460830526483,-0.1202220553,0.121998460927858,0
-1,1.82917662184333,2.89388269168932,0.110781239485760,-0.262387884050666,-0.00517657837760664,-0.0224028641246511,-0.108606003593092,0
-1,-3.17279743572930,-2.86698187406046,0.110873139279243,-0.093614374710967,0.0925974010859032,-0.00747619041107016,-0.066394213442664,1
-1,-3.20104938765970,-1.68043245593876,0.123227179211642,-0.00179275501686146,-0.175893752209014,-0.0835732816974749,0.0560957582079696,1
-1,-1.89923900052239,-2.92427973445236,0.147975477003611,0.00819675018680998,0.00470753628896422,-0.0122227288860826,0.209903875101594,1
-1,0.148491843864120,-1.54734877494689,0.162479731968606,0.112962938668545,-0.0100535803565242,0.0422099301034027,0.0752974779385111,1

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/bin/set-dfs-commands.sh
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/bin/set-dfs-commands.sh b/community/mahout-mr/examples/bin/set-dfs-commands.sh
deleted file mode 100755
index 0ee5fe1..0000000
--- a/community/mahout-mr/examples/bin/set-dfs-commands.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-#
-# Requires $HADOOP_HOME to be set.
-#
-# Figures out the major version of Hadoop we're using and sets commands
-# for dfs commands
-#
-# Run by each example script.
-
-# Find a hadoop shell
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="${HADOOP_HOME}/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
-
-# Check Hadoop version
-v=`${HADOOP_HOME}/bin/hadoop version | egrep "Hadoop [0-9]+.[0-9]+.[0-9]+" | cut -f 2 -d ' ' | cut -f 1 -d '.'`
-
-if [ $v -eq "1" -o $v -eq "0" ]
-then
- echo "Discovered Hadoop v0 or v1."
- export DFS="${HADOOP_HOME}/bin/hadoop dfs"
- export DFSRM="$DFS -rmr -skipTrash"
-elif [ $v -eq "2" ]
-then
- echo "Discovered Hadoop v2."
- export DFS="${HADOOP_HOME}/bin/hdfs dfs"
- export DFSRM="$DFS -rm -r -skipTrash"
-else
- echo "Can't determine Hadoop version."
- exit 1
-fi
-echo "Setting dfs command to $DFS, dfs rm to $DFSRM."
-
-export HVERSION=$v

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/pom.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/pom.xml b/community/mahout-mr/examples/pom.xml
deleted file mode 100644
index 28a5795..0000000
--- a/community/mahout-mr/examples/pom.xml
+++ /dev/null
@@ -1,199 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.mahout</groupId>
- <artifactId>mahout-mr</artifactId>
- <version>0.14.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <artifactId>mr-examples</artifactId>
- <name>Mahout Examples</name>
- <description>Scalable machine learning library examples</description>
-
- <packaging>jar</packaging>
- <properties>
- <mahout.skip.example>false</mahout.skip.example>
- </properties>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <id>copy-dependencies</id>
- <phase>package</phase>
- <goals>
- <goal>copy-dependencies</goal>
- </goals>
- <configuration>
- <!-- configure the plugin here -->
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <!-- create examples hadoop job jar -->
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-assembly-plugin</artifactId>
- <executions>
- <execution>
- <id>job</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- <configuration>
- <skipAssembly>${mahout.skip.example}</skipAssembly>
- <descriptors>
- <descriptor>src/main/assembly/job.xml</descriptor>
- </descriptors>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-remote-resources-plugin</artifactId>
- <configuration>
- <appendedResourcesDirectory>../src/main/appended-resources</appendedResourcesDirectory>
- <resourceBundles>
- <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle>
- </resourceBundles>
- <supplementalModels>
- <supplementalModel>supplemental-models.xml</supplementalModel>
- </supplementalModels>
- </configuration>
- </plugin>
-
- <plugin>
- <artifactId>maven-source-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.mortbay.jetty</groupId>
- <artifactId>maven-jetty-plugin</artifactId>
- <version>6.1.26</version>
- </plugin>
- </plugins>
-
- </build>
-
- <dependencies>
-
- <!-- our modules -->
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-hdfs</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-mr</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-hdfs</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-mr</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-math</artifactId>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-integration</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-benchmark</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- </dependency>
-
- <dependency>
- <groupId>com.carrotsearch.randomizedtesting</groupId>
- <artifactId>randomizedtesting-runner</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.easymock</groupId>
- <artifactId>easymock</artifactId>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </dependency>
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </dependency>
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </dependency>
-
- </dependencies>
-
- <profiles>
- <profile>
- <id>release.prepare</id>
- <properties>
- <mahout.skip.example>true</mahout.skip.example>
- </properties>
- </profile>
- </profiles>
-</project>

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/assembly/job.xml
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/assembly/job.xml b/community/mahout-mr/examples/src/main/assembly/job.xml
deleted file mode 100644
index 0c41f3d..0000000
--- a/community/mahout-mr/examples/src/main/assembly/job.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly
- xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0
- http://maven.apache.org/xsd/assembly-1.1.0.xsd">
- <id>job</id>
- <formats>
- <format>jar</format>
- </formats>
- <includeBaseDirectory>false</includeBaseDirectory>
- <dependencySets>
- <dependencySet>
- <unpack>true</unpack>
- <unpackOptions>
- <!-- MAHOUT-1126 -->
- <excludes>
- <exclude>META-INF/LICENSE</exclude>
- </excludes>
- </unpackOptions>
- <scope>runtime</scope>
- <outputDirectory>/</outputDirectory>
- <useTransitiveFiltering>true</useTransitiveFiltering>
- <excludes>
- <exclude>org.apache.hadoop:hadoop-core</exclude>
- </excludes>
- </dependencySet>
- </dependencySets>
-</assembly>
-
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
deleted file mode 100644
index 6392b9f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/TasteOptionParser.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example;
-
-import java.io.File;
-
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
-
-/**
- * This class provides a common implementation for parsing input parameters for
- * all taste examples. Currently they only need the path to the recommendations
- * file as input.
- *
- * The class is safe to be used in threaded contexts.
- */
-public final class TasteOptionParser {
-
- private TasteOptionParser() {
- }
-
- /**
- * Parse the given command line arguments.
- * @param args the arguments as given to the application.
- * @return the input file if a file was given on the command line, null otherwise.
- */
- public static File getRatings(String[] args) throws OptionException {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i")
- .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
- .withDescription("The Path for input data directory.").create();
-
- Option helpOpt = DefaultOptionCreator.helpOption();
-
- Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create();
-
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return null;
- }
-
- return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null;
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
deleted file mode 100644
index c908e5b..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommender.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
-import org.apache.mahout.cf.taste.impl.recommender.GenericBooleanPrefUserBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-import java.util.Collection;
-import java.util.List;
-
-/**
- * A simple {@link Recommender} implemented for the Book Crossing demo.
- * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
- */
-public final class BookCrossingBooleanRecommender implements Recommender {
-
- private final Recommender recommender;
-
- public BookCrossingBooleanRecommender(DataModel bcModel) throws TasteException {
- UserSimilarity similarity = new CachingUserSimilarity(new LogLikelihoodSimilarity(bcModel), bcModel);
- UserNeighborhood neighborhood =
- new NearestNUserNeighborhood(10, Double.NEGATIVE_INFINITY, similarity, bcModel, 1.0);
- recommender = new GenericBooleanPrefUserBasedRecommender(bcModel, neighborhood, similarity);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
- return recommender.recommend(userID, howMany);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
- return recommend(userID, howMany, null, includeKnownItems);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, false);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
- throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, includeKnownItems);
- }
-
- @Override
- public float estimatePreference(long userID, long itemID) throws TasteException {
- return recommender.estimatePreference(userID, itemID);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- recommender.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- recommender.removePreference(userID, itemID);
- }
-
- @Override
- public DataModel getDataModel() {
- return recommender.getDataModel();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- recommender.refresh(alreadyRefreshed);
- }
-
- @Override
- public String toString() {
- return "BookCrossingBooleanRecommender[recommender:" + recommender + ']';
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
deleted file mode 100644
index 2219bce..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class BookCrossingBooleanRecommenderBuilder implements RecommenderBuilder {
-
- @Override
- public Recommender buildRecommender(DataModel dataModel) throws TasteException {
- return new BookCrossingBooleanRecommender(dataModel);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
deleted file mode 100644
index b9814c7..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingBooleanRecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.IRStatistics;
-import org.apache.mahout.cf.taste.eval.RecommenderIRStatsEvaluator;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.impl.eval.GenericRecommenderIRStatsEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.File;
-import java.io.IOException;
-
-public final class BookCrossingBooleanRecommenderEvaluatorRunner {
-
- private static final Logger log = LoggerFactory.getLogger(BookCrossingBooleanRecommenderEvaluatorRunner.class);
-
- private BookCrossingBooleanRecommenderEvaluatorRunner() {
- // do nothing
- }
-
- public static void main(String... args) throws IOException, TasteException, OptionException {
- RecommenderIRStatsEvaluator evaluator = new GenericRecommenderIRStatsEvaluator();
- File ratingsFile = TasteOptionParser.getRatings(args);
- DataModel model =
- ratingsFile == null ? new BookCrossingDataModel(true) : new BookCrossingDataModel(ratingsFile, true);
-
- IRStatistics evaluation = evaluator.evaluate(
- new BookCrossingBooleanRecommenderBuilder(),
- new BookCrossingDataModelBuilder(),
- model,
- null,
- 3,
- Double.NEGATIVE_INFINITY,
- 1.0);
-
- log.info(String.valueOf(evaluation));
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
deleted file mode 100644
index 3e2f8b5..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.regex.Pattern;
-
-import com.google.common.base.Charsets;
-import com.google.common.io.Closeables;
-import org.apache.mahout.cf.taste.similarity.precompute.example.GroupLensDataModel;
-import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
-import org.apache.mahout.common.iterator.FileLineIterable;
-
-/**
- * See <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip">download</a> for
- * data needed by this class. The BX-Book-Ratings.csv file is needed.
- */
-public final class BookCrossingDataModel extends FileDataModel {
-
- private static final Pattern NON_DIGIT_SEMICOLON_PATTERN = Pattern.compile("[^0-9;]");
-
- public BookCrossingDataModel(boolean ignoreRatings) throws IOException {
- this(GroupLensDataModel.readResourceToTempFile(
- "/org/apache/mahout/cf/taste/example/bookcrossing/BX-Book-Ratings.csv"),
- ignoreRatings);
- }
-
- /**
- * @param ratingsFile BookCrossing ratings file in its native format
- * @throws IOException if an error occurs while reading or writing files
- */
- public BookCrossingDataModel(File ratingsFile, boolean ignoreRatings) throws IOException {
- super(convertBCFile(ratingsFile, ignoreRatings));
- }
-
- private static File convertBCFile(File originalFile, boolean ignoreRatings) throws IOException {
- if (!originalFile.exists()) {
- throw new FileNotFoundException(originalFile.toString());
- }
- File resultFile = new File(new File(System.getProperty("java.io.tmpdir")), "taste.bookcrossing.txt");
- resultFile.delete();
- Writer writer = null;
- try {
- writer = new OutputStreamWriter(new FileOutputStream(resultFile), Charsets.UTF_8);
- for (String line : new FileLineIterable(originalFile, true)) {
- // 0 ratings are basically "no rating", ignore them (thanks h.9000)
- if (line.endsWith("\"0\"")) {
- continue;
- }
- // Delete replace anything that isn't numeric, or a semicolon delimiter. Make comma the delimiter.
- String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
- .replaceAll("").replace(';', ',');
- // If this means we deleted an entire ID -- few cases like that -- skip the line
- if (convertedLine.contains(",,")) {
- continue;
- }
- if (ignoreRatings) {
- // drop rating
- convertedLine = convertedLine.substring(0, convertedLine.lastIndexOf(','));
- }
- writer.write(convertedLine);
- writer.write('\n');
- }
- writer.flush();
- } catch (IOException ioe) {
- resultFile.delete();
- throw ioe;
- } finally {
- Closeables.close(writer, false);
- }
- return resultFile;
- }
-
- @Override
- public String toString() {
- return "BookCrossingDataModel";
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
deleted file mode 100644
index 9ec2eaf..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModelBuilder.java
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.eval.DataModelBuilder;
-import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
-import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.model.PreferenceArray;
-
-final class BookCrossingDataModelBuilder implements DataModelBuilder {
-
- @Override
- public DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData) {
- return new GenericBooleanPrefDataModel(GenericBooleanPrefDataModel.toDataMap(trainingData));
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
deleted file mode 100644
index c06ca2f..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommender.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.mahout.cf.taste.common.Refreshable;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.impl.neighborhood.NearestNUserNeighborhood;
-import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender;
-import org.apache.mahout.cf.taste.impl.similarity.CachingUserSimilarity;
-import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood;
-import org.apache.mahout.cf.taste.recommender.IDRescorer;
-import org.apache.mahout.cf.taste.recommender.RecommendedItem;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-import org.apache.mahout.cf.taste.similarity.UserSimilarity;
-
-/**
- * A simple {@link Recommender} implemented for the Book Crossing demo.
- * See the <a href="http://www.informatik.uni-freiburg.de/~cziegler/BX/">Book Crossing site</a>.
- */
-public final class BookCrossingRecommender implements Recommender {
-
- private final Recommender recommender;
-
- public BookCrossingRecommender(DataModel bcModel) throws TasteException {
- UserSimilarity similarity = new CachingUserSimilarity(new EuclideanDistanceSimilarity(bcModel), bcModel);
- UserNeighborhood neighborhood = new NearestNUserNeighborhood(10, 0.2, similarity, bcModel, 0.2);
- recommender = new GenericUserBasedRecommender(bcModel, neighborhood, similarity);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany) throws TasteException {
- return recommender.recommend(userID, howMany);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, boolean includeKnownItems) throws TasteException {
- return recommend(userID, howMany, null, includeKnownItems);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, false);
- }
-
- @Override
- public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer, boolean includeKnownItems)
- throws TasteException {
- return recommender.recommend(userID, howMany, rescorer, false);
- }
-
- @Override
- public float estimatePreference(long userID, long itemID) throws TasteException {
- return recommender.estimatePreference(userID, itemID);
- }
-
- @Override
- public void setPreference(long userID, long itemID, float value) throws TasteException {
- recommender.setPreference(userID, itemID, value);
- }
-
- @Override
- public void removePreference(long userID, long itemID) throws TasteException {
- recommender.removePreference(userID, itemID);
- }
-
- @Override
- public DataModel getDataModel() {
- return recommender.getDataModel();
- }
-
- @Override
- public void refresh(Collection<Refreshable> alreadyRefreshed) {
- recommender.refresh(alreadyRefreshed);
- }
-
- @Override
- public String toString() {
- return "BookCrossingRecommender[recommender:" + recommender + ']';
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
deleted file mode 100644
index bb6d3e1..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderBuilder.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderBuilder;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.apache.mahout.cf.taste.recommender.Recommender;
-
-final class BookCrossingRecommenderBuilder implements RecommenderBuilder {
-
- @Override
- public Recommender buildRecommender(DataModel dataModel) throws TasteException {
- return new BookCrossingRecommender(dataModel);
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
deleted file mode 100644
index 97074d2..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingRecommenderEvaluatorRunner.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.bookcrossing;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.commons.cli2.OptionException;
-import org.apache.mahout.cf.taste.common.TasteException;
-import org.apache.mahout.cf.taste.eval.RecommenderEvaluator;
-import org.apache.mahout.cf.taste.example.TasteOptionParser;
-import org.apache.mahout.cf.taste.impl.eval.AverageAbsoluteDifferenceRecommenderEvaluator;
-import org.apache.mahout.cf.taste.model.DataModel;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public final class BookCrossingRecommenderEvaluatorRunner {
-
- private static final Logger log = LoggerFactory.getLogger(BookCrossingRecommenderEvaluatorRunner.class);
-
- private BookCrossingRecommenderEvaluatorRunner() {
- // do nothing
- }
-
- public static void main(String... args) throws IOException, TasteException, OptionException {
- RecommenderEvaluator evaluator = new AverageAbsoluteDifferenceRecommenderEvaluator();
- File ratingsFile = TasteOptionParser.getRatings(args);
- DataModel model =
- ratingsFile == null ? new BookCrossingDataModel(false) : new BookCrossingDataModel(ratingsFile, false);
-
- double evaluation = evaluator.evaluate(new BookCrossingRecommenderBuilder(),
- null,
- model,
- 0.9,
- 0.3);
- log.info(String.valueOf(evaluation));
- }
-
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
deleted file mode 100644
index 9244fe3..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/README
+++ /dev/null
@@ -1,9 +0,0 @@
-Code works with BookCrossing data set, which is not included in this distribution but is downloadable from
-http://www.informatik.uni-freiburg.de/~cziegler/BX/
-
-Data set originated from:
-
-Improving Recommendation Lists Through Topic Diversification,
- Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, Georg Lausen;
- Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan.
- To appear.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
deleted file mode 100644
index 033daa2..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.HadoopUtil;
-import org.apache.mahout.common.Pair;
-import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
-import org.apache.mahout.math.map.OpenObjectIntHashMap;
-
-import java.io.IOException;
-import java.util.regex.Pattern;
-
-public final class EmailUtility {
-
- public static final String SEPARATOR = "separator";
- public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
- public static final String FROM_PREFIX = "fromPrefix";
- public static final String MSG_ID_DIMENSION = "msgIdDim";
- public static final String FROM_INDEX = "fromIdx";
- public static final String REFS_INDEX = "refsIdx";
- private static final String[] EMPTY = new String[0];
- private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
- private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
- private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
- public static final Pattern WHITESPACE = Pattern.compile("\\s*");
-
- private EmailUtility() {
- }
-
- /**
- * Strip off some spurious characters that make it harder to dedup
- */
- public static String cleanUpEmailAddress(CharSequence address) {
- //do some cleanup to normalize some things, like: Key: karthik ananth <***@gmail.com>: Value: 178
- //Key: karthik ananth [mailto:***@gmail.com]=20: Value: 179
- //TODO: is there more to clean up here?
- return ADDRESS_CLEANUP.matcher(address).replaceAll("");
- }
-
- public static void loadDictionaries(Configuration conf, String fromPrefix,
- OpenObjectIntHashMap<String> fromDictionary,
- String msgIdPrefix,
- OpenObjectIntHashMap<String> msgIdDictionary) throws IOException {
-
- Path[] localFiles = HadoopUtil.getCachedFiles(conf);
- FileSystem fs = FileSystem.getLocal(conf);
- for (Path dictionaryFile : localFiles) {
-
- // key is word value is id
-
- OpenObjectIntHashMap<String> dictionary = null;
- if (dictionaryFile.getName().startsWith(fromPrefix)) {
- dictionary = fromDictionary;
- } else if (dictionaryFile.getName().startsWith(msgIdPrefix)) {
- dictionary = msgIdDictionary;
- }
- if (dictionary != null) {
- dictionaryFile = fs.makeQualified(dictionaryFile);
- for (Pair<Writable, IntWritable> record
- : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
- dictionary.put(record.getFirst().toString(), record.getSecond().get());
- }
- }
- }
-
- }
-
- public static String[] parseReferences(CharSequence rawRefs) {
- String[] splits;
- if (rawRefs != null && rawRefs.length() > 0) {
- splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs);
- for (int i = 0; i < splits.length; i++) {
- splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll("");
- }
- } else {
- splits = EMPTY;
- }
- return splits;
- }
-
- public enum Counters {
- NO_MESSAGE_ID, NO_FROM_ADDRESS
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
deleted file mode 100644
index 5cd308d..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- */
-public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
-
- private String separator;
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
- }
-
- @Override
- protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
- //From is in the value
- String valStr = value.toString();
- int idx = valStr.indexOf(separator);
- if (idx == -1) {
- context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
- } else {
- String full = valStr.substring(0, idx);
- //do some cleanup to normalize some things, like: Key: karthik ananth <***@gmail.com>: Value: 178
- //Key: karthik ananth [mailto:***@gmail.com]=20: Value: 179
- //TODO: is there more to clean up here?
- full = EmailUtility.cleanUpEmailAddress(full);
-
- if (EmailUtility.WHITESPACE.matcher(full).matches()) {
- context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
- } else {
- context.write(new Text(full), new VarIntWritable(1));
- }
- }
-
- }
-}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java b/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
deleted file mode 100644
index 72fcde9..0000000
--- a/community/mahout-mr/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mahout.cf.taste.example.email;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.math.VarIntWritable;
-
-import java.io.IOException;
-
-/**
- * Key: the string id
- * Value: the count
- * Out Key: the string id
- * Out Value: the sum of the counts
- */
-public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> {
-
- @Override
- protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for (VarIntWritable value : values) {
- sum += value.get();
- }
- context.write(new Text(key), new VarIntWritable(sum));
- }
-}
r***@apache.org
2018-06-28 14:55:19 UTC
Permalink
NO-JIRA Clean up MR refactor


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/410ed16a
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/410ed16a
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/410ed16a

Branch: refs/heads/branch-0.14.0
Commit: 410ed16af1fc587999868dd4990cebfa7d14633e
Parents: e0573de
Author: Trevor a.k.a @rawkintrevo <***@gmail.com>
Authored: Thu Jun 28 09:38:33 2018 -0500
Committer: Trevor a.k.a @rawkintrevo <***@gmail.com>
Committed: Thu Jun 28 09:38:33 2018 -0500

----------------------------------------------------------------------
community/mahout-mr/conf/log4j.xml | 17 +
community/mahout-mr/examples/bin/README.txt | 13 -
.../examples/bin/classify-20newsgroups.sh | 197 -
.../examples/bin/classify-wikipedia.sh | 196 -
.../mahout-mr/examples/bin/cluster-reuters.sh | 203 -
.../examples/bin/cluster-syntheticcontrol.sh | 105 -
.../examples/bin/factorize-movielens-1M.sh | 85 -
.../mahout-mr/examples/bin/factorize-netflix.sh | 90 -
.../mahout-mr/examples/bin/get-all-examples.sh | 36 -
community/mahout-mr/examples/bin/lda.algorithm | 45 -
.../examples/bin/resources/bank-full.csv | 45212 ---------
.../examples/bin/resources/country.txt | 229 -
.../examples/bin/resources/country10.txt | 10 -
.../examples/bin/resources/country2.txt | 2 -
.../examples/bin/resources/donut-test.csv | 41 -
.../mahout-mr/examples/bin/resources/donut.csv | 41 -
.../examples/bin/resources/test-data.csv | 61 -
.../mahout-mr/examples/bin/set-dfs-commands.sh | 54 -
community/mahout-mr/examples/pom.xml | 199 -
.../examples/src/main/assembly/job.xml | 46 -
.../cf/taste/example/TasteOptionParser.java | 75 -
.../BookCrossingBooleanRecommender.java | 102 -
.../BookCrossingBooleanRecommenderBuilder.java | 32 -
...ossingBooleanRecommenderEvaluatorRunner.java | 59 -
.../bookcrossing/BookCrossingDataModel.java | 99 -
.../BookCrossingDataModelBuilder.java | 33 -
.../bookcrossing/BookCrossingRecommender.java | 101 -
.../BookCrossingRecommenderBuilder.java | 32 -
.../BookCrossingRecommenderEvaluatorRunner.java | 54 -
.../mahout/cf/taste/example/bookcrossing/README | 9 -
.../cf/taste/example/email/EmailUtility.java | 104 -
.../email/FromEmailToDictionaryMapper.java | 61 -
.../example/email/MailToDictionaryReducer.java | 43 -
.../taste/example/email/MailToPrefsDriver.java | 274 -
.../cf/taste/example/email/MailToRecMapper.java | 101 -
.../taste/example/email/MailToRecReducer.java | 53 -
.../example/email/MsgIdToDictionaryMapper.java | 49 -
.../taste/example/kddcup/DataFileIterable.java | 44 -
.../taste/example/kddcup/DataFileIterator.java | 158 -
.../taste/example/kddcup/KDDCupDataModel.java | 231 -
.../mahout/cf/taste/example/kddcup/ToCSV.java | 77 -
.../kddcup/track1/EstimateConverter.java | 43 -
.../example/kddcup/track1/Track1Callable.java | 67 -
.../kddcup/track1/Track1Recommender.java | 94 -
.../kddcup/track1/Track1RecommenderBuilder.java | 32 -
.../track1/Track1RecommenderEvaluator.java | 108 -
.../Track1RecommenderEvaluatorRunner.java | 56 -
.../example/kddcup/track1/Track1Runner.java | 95 -
.../svd/DataModelFactorizablePreferences.java | 107 -
.../track1/svd/FactorizablePreferences.java | 44 -
.../svd/KDDCupFactorizablePreferences.java | 123 -
.../track1/svd/ParallelArraysSGDFactorizer.java | 265 -
.../kddcup/track1/svd/Track1SVDRunner.java | 141 -
.../example/kddcup/track2/HybridSimilarity.java | 62 -
.../example/kddcup/track2/Track2Callable.java | 106 -
.../kddcup/track2/Track2Recommender.java | 100 -
.../kddcup/track2/Track2RecommenderBuilder.java | 33 -
.../example/kddcup/track2/Track2Runner.java | 100 -
.../taste/example/kddcup/track2/TrackData.java | 71 -
.../kddcup/track2/TrackItemSimilarity.java | 106 -
.../taste/example/kddcup/track2/UserResult.java | 54 -
.../als/netflix/NetflixDatasetConverter.java | 140 -
.../example/BatchItemSimilaritiesGroupLens.java | 65 -
.../precompute/example/GroupLensDataModel.java | 96 -
.../mahout/classifier/NewsgroupHelper.java | 128 -
.../classifier/email/PrepEmailMapper.java | 65 -
.../classifier/email/PrepEmailReducer.java | 47 -
.../email/PrepEmailVectorsDriver.java | 76 -
.../sequencelearning/hmm/PosTagger.java | 277 -
.../sgd/AdaptiveLogisticModelParameters.java | 236 -
.../classifier/sgd/LogisticModelParameters.java | 265 -
.../classifier/sgd/PrintResourceOrFile.java | 42 -
.../classifier/sgd/RunAdaptiveLogistic.java | 197 -
.../mahout/classifier/sgd/RunLogistic.java | 163 -
.../apache/mahout/classifier/sgd/SGDHelper.java | 151 -
.../apache/mahout/classifier/sgd/SGDInfo.java | 59 -
.../classifier/sgd/SimpleCsvExamples.java | 283 -
.../mahout/classifier/sgd/TestASFEmail.java | 152 -
.../mahout/classifier/sgd/TestNewsGroups.java | 141 -
.../mahout/classifier/sgd/TrainASFEmail.java | 137 -
.../classifier/sgd/TrainAdaptiveLogistic.java | 377 -
.../mahout/classifier/sgd/TrainLogistic.java | 311 -
.../mahout/classifier/sgd/TrainNewsGroups.java | 154 -
.../sgd/ValidateAdaptiveLogistic.java | 218 -
.../BankMarketingClassificationMain.java | 70 -
.../sgd/bankmarketing/TelephoneCall.java | 104 -
.../sgd/bankmarketing/TelephoneCallParser.java | 66 -
.../clustering/display/ClustersFilter.java | 31 -
.../clustering/display/DisplayCanopy.java | 88 -
.../clustering/display/DisplayClustering.java | 374 -
.../clustering/display/DisplayFuzzyKMeans.java | 110 -
.../clustering/display/DisplayKMeans.java | 106 -
.../display/DisplaySpectralKMeans.java | 85 -
.../apache/mahout/clustering/display/README.txt | 22 -
.../tools/ClusterQualitySummarizer.java | 279 -
.../clustering/streaming/tools/IOUtils.java | 80 -
.../clustering/syntheticcontrol/canopy/Job.java | 125 -
.../syntheticcontrol/fuzzykmeans/Job.java | 144 -
.../clustering/syntheticcontrol/kmeans/Job.java | 187 -
.../fpm/pfpgrowth/DeliciousTagsExample.java | 94 -
.../dataset/KeyBasedStringTupleCombiner.java | 40 -
.../dataset/KeyBasedStringTupleGrouper.java | 77 -
.../dataset/KeyBasedStringTupleMapper.java | 90 -
.../dataset/KeyBasedStringTupleReducer.java | 74 -
.../examples/src/main/resources/bank-full.csv | 45212 ---------
.../src/main/resources/cf-data-purchase.txt | 7 -
.../src/main/resources/cf-data-view.txt | 12 -
.../examples/src/main/resources/donut-test.csv | 41 -
.../examples/src/main/resources/donut.csv | 41 -
.../examples/src/main/resources/test-data.csv | 61 -
.../sgd/LogisticModelParametersTest.java | 43 -
.../classifier/sgd/ModelDissectorTest.java | 40 -
.../classifier/sgd/TrainLogisticTest.java | 167 -
.../clustering/display/ClustersFilterTest.java | 75 -
.../apache/mahout/examples/MahoutTestCase.java | 30 -
.../examples/src/test/resources/country.txt | 229 -
.../examples/src/test/resources/country10.txt | 10 -
.../examples/src/test/resources/country2.txt | 2 -
.../examples/src/test/resources/subjects.txt | 2 -
.../examples/src/test/resources/wdbc.infos | 32 -
.../examples/src/test/resources/wdbc/wdbc.data | 569 -
community/mahout-mr/integration/pom.xml | 2 +-
community/mahout-mr/mr-examples/bin/README.txt | 13 +
.../mr-examples/bin/classify-20newsgroups.sh | 197 +
.../mr-examples/bin/classify-wikipedia.sh | 196 +
.../mr-examples/bin/cluster-reuters.sh | 203 +
.../mr-examples/bin/cluster-syntheticcontrol.sh | 105 +
.../mr-examples/bin/factorize-movielens-1M.sh | 85 +
.../mr-examples/bin/factorize-netflix.sh | 90 +
.../mr-examples/bin/get-all-examples.sh | 36 +
.../mahout-mr/mr-examples/bin/lda.algorithm | 45 +
.../mr-examples/bin/resources/bank-full.csv | 45212 +++++++++
.../mr-examples/bin/resources/country.txt | 229 +
.../mr-examples/bin/resources/country10.txt | 10 +
.../mr-examples/bin/resources/country2.txt | 2 +
.../mr-examples/bin/resources/donut-test.csv | 41 +
.../mr-examples/bin/resources/donut.csv | 41 +
.../mr-examples/bin/resources/test-data.csv | 61 +
.../mr-examples/bin/set-dfs-commands.sh | 54 +
community/mahout-mr/mr-examples/pom.xml | 121 +
.../mr-examples/src/main/assembly/job.xml | 46 +
.../cf/taste/example/TasteOptionParser.java | 75 +
.../BookCrossingBooleanRecommender.java | 102 +
.../BookCrossingBooleanRecommenderBuilder.java | 32 +
...ossingBooleanRecommenderEvaluatorRunner.java | 59 +
.../bookcrossing/BookCrossingDataModel.java | 99 +
.../BookCrossingDataModelBuilder.java | 33 +
.../bookcrossing/BookCrossingRecommender.java | 101 +
.../BookCrossingRecommenderBuilder.java | 32 +
.../BookCrossingRecommenderEvaluatorRunner.java | 54 +
.../mahout/cf/taste/example/bookcrossing/README | 9 +
.../cf/taste/example/email/EmailUtility.java | 104 +
.../email/FromEmailToDictionaryMapper.java | 61 +
.../example/email/MailToDictionaryReducer.java | 43 +
.../taste/example/email/MailToPrefsDriver.java | 274 +
.../cf/taste/example/email/MailToRecMapper.java | 101 +
.../taste/example/email/MailToRecReducer.java | 53 +
.../example/email/MsgIdToDictionaryMapper.java | 49 +
.../taste/example/kddcup/DataFileIterable.java | 44 +
.../taste/example/kddcup/DataFileIterator.java | 158 +
.../taste/example/kddcup/KDDCupDataModel.java | 231 +
.../mahout/cf/taste/example/kddcup/ToCSV.java | 77 +
.../kddcup/track1/EstimateConverter.java | 43 +
.../example/kddcup/track1/Track1Callable.java | 67 +
.../kddcup/track1/Track1Recommender.java | 94 +
.../kddcup/track1/Track1RecommenderBuilder.java | 32 +
.../track1/Track1RecommenderEvaluator.java | 108 +
.../Track1RecommenderEvaluatorRunner.java | 56 +
.../example/kddcup/track1/Track1Runner.java | 95 +
.../svd/DataModelFactorizablePreferences.java | 107 +
.../track1/svd/FactorizablePreferences.java | 44 +
.../svd/KDDCupFactorizablePreferences.java | 123 +
.../track1/svd/ParallelArraysSGDFactorizer.java | 265 +
.../kddcup/track1/svd/Track1SVDRunner.java | 141 +
.../example/kddcup/track2/HybridSimilarity.java | 62 +
.../example/kddcup/track2/Track2Callable.java | 106 +
.../kddcup/track2/Track2Recommender.java | 100 +
.../kddcup/track2/Track2RecommenderBuilder.java | 33 +
.../example/kddcup/track2/Track2Runner.java | 100 +
.../taste/example/kddcup/track2/TrackData.java | 71 +
.../kddcup/track2/TrackItemSimilarity.java | 106 +
.../taste/example/kddcup/track2/UserResult.java | 54 +
.../als/netflix/NetflixDatasetConverter.java | 140 +
.../example/BatchItemSimilaritiesGroupLens.java | 65 +
.../precompute/example/GroupLensDataModel.java | 96 +
.../mahout/classifier/NewsgroupHelper.java | 128 +
.../classifier/email/PrepEmailMapper.java | 65 +
.../classifier/email/PrepEmailReducer.java | 47 +
.../email/PrepEmailVectorsDriver.java | 76 +
.../sequencelearning/hmm/PosTagger.java | 277 +
.../sgd/AdaptiveLogisticModelParameters.java | 236 +
.../classifier/sgd/LogisticModelParameters.java | 265 +
.../classifier/sgd/PrintResourceOrFile.java | 42 +
.../classifier/sgd/RunAdaptiveLogistic.java | 197 +
.../mahout/classifier/sgd/RunLogistic.java | 163 +
.../apache/mahout/classifier/sgd/SGDHelper.java | 151 +
.../apache/mahout/classifier/sgd/SGDInfo.java | 59 +
.../classifier/sgd/SimpleCsvExamples.java | 283 +
.../mahout/classifier/sgd/TestASFEmail.java | 152 +
.../mahout/classifier/sgd/TestNewsGroups.java | 141 +
.../mahout/classifier/sgd/TrainASFEmail.java | 137 +
.../classifier/sgd/TrainAdaptiveLogistic.java | 377 +
.../mahout/classifier/sgd/TrainLogistic.java | 311 +
.../mahout/classifier/sgd/TrainNewsGroups.java | 154 +
.../sgd/ValidateAdaptiveLogistic.java | 218 +
.../BankMarketingClassificationMain.java | 70 +
.../sgd/bankmarketing/TelephoneCall.java | 104 +
.../sgd/bankmarketing/TelephoneCallParser.java | 66 +
.../clustering/display/ClustersFilter.java | 31 +
.../clustering/display/DisplayCanopy.java | 88 +
.../clustering/display/DisplayClustering.java | 374 +
.../clustering/display/DisplayFuzzyKMeans.java | 110 +
.../clustering/display/DisplayKMeans.java | 106 +
.../display/DisplaySpectralKMeans.java | 85 +
.../apache/mahout/clustering/display/README.txt | 22 +
.../tools/ClusterQualitySummarizer.java | 279 +
.../clustering/streaming/tools/IOUtils.java | 80 +
.../clustering/syntheticcontrol/canopy/Job.java | 125 +
.../syntheticcontrol/fuzzykmeans/Job.java | 144 +
.../clustering/syntheticcontrol/kmeans/Job.java | 187 +
.../fpm/pfpgrowth/DeliciousTagsExample.java | 94 +
.../dataset/KeyBasedStringTupleCombiner.java | 40 +
.../dataset/KeyBasedStringTupleGrouper.java | 77 +
.../dataset/KeyBasedStringTupleMapper.java | 90 +
.../dataset/KeyBasedStringTupleReducer.java | 74 +
.../src/main/resources/bank-full.csv | 45212 +++++++++
.../src/main/resources/cf-data-purchase.txt | 7 +
.../src/main/resources/cf-data-view.txt | 12 +
.../src/main/resources/donut-test.csv | 41 +
.../mr-examples/src/main/resources/donut.csv | 41 +
.../src/main/resources/test-data.csv | 61 +
.../sgd/LogisticModelParametersTest.java | 43 +
.../classifier/sgd/ModelDissectorTest.java | 40 +
.../classifier/sgd/TrainLogisticTest.java | 167 +
.../clustering/display/ClustersFilterTest.java | 75 +
.../apache/mahout/examples/MahoutTestCase.java | 30 +
.../mr-examples/src/test/resources/country.txt | 229 +
.../src/test/resources/country10.txt | 10 +
.../mr-examples/src/test/resources/country2.txt | 2 +
.../mr-examples/src/test/resources/subjects.txt | 2 +
.../mr-examples/src/test/resources/wdbc.infos | 32 +
.../src/test/resources/wdbc/wdbc.data | 569 +
community/mahout-mr/mr/pom.xml | 295 +
.../appended-resources/supplemental-models.xml | 279 +
.../mr/src/images/logos/ mahout-powered.svg | 630 +
.../mahout-mr/mr/src/images/logos/favicon.ico | Bin 0 -> 28838 bytes
.../mr/src/images/logos/favicon128.png | Bin 0 -> 5259 bytes
.../mahout-mr/mr/src/images/logos/favicon16.png | Bin 0 -> 1009 bytes
.../mahout-mr/mr/src/images/logos/favicon32.png | Bin 0 -> 1847 bytes
.../mahout-mr/mr/src/images/logos/favicon64.png | Bin 0 -> 3148 bytes
.../mr/src/images/logos/mahout-logo-100.png | Bin 0 -> 19477 bytes
.../mr/src/images/logos/mahout-logo-200.png | Bin 0 -> 46360 bytes
.../mr/src/images/logos/mahout-logo-300.png | Bin 0 -> 70139 bytes
.../mr/src/images/logos/mahout-logo-400.png | Bin 0 -> 55468 bytes
.../images/logos/mahout-logo-poweredby-100.png | Bin 0 -> 24623 bytes
.../images/logos/mahout-logo-poweredby-55.png | Bin 0 -> 11684 bytes
.../logos/mahout-logo-transparent-400.png | Bin 0 -> 61970 bytes
.../mr/src/images/logos/mahout-logo.svg | 627 +
.../mahout-mr/mr/src/main/assembly/job.xml | 61 +
.../mahout-mr/mr/src/main/assembly/src.xml | 64 +
.../main/java/org/apache/mahout/Version.java | 41 +
.../cf/taste/common/NoSuchItemException.java | 32 +
.../cf/taste/common/NoSuchUserException.java | 32 +
.../mahout/cf/taste/common/Refreshable.java | 53 +
.../mahout/cf/taste/common/TasteException.java | 41 +
.../mahout/cf/taste/common/Weighting.java | 31 +
.../mahout/cf/taste/eval/DataModelBuilder.java | 45 +
.../mahout/cf/taste/eval/IRStatistics.java | 80 +
.../cf/taste/eval/RecommenderBuilder.java | 45 +
.../cf/taste/eval/RecommenderEvaluator.java | 105 +
.../taste/eval/RecommenderIRStatsEvaluator.java | 64 +
.../taste/eval/RelevantItemsDataSplitter.java | 62 +
.../cf/taste/hadoop/EntityEntityWritable.java | 98 +
.../cf/taste/hadoop/EntityPrefWritable.java | 89 +
.../cf/taste/hadoop/MutableRecommendedItem.java | 81 +
.../taste/hadoop/RecommendedItemsWritable.java | 96 +
.../cf/taste/hadoop/TasteHadoopUtils.java | 84 +
.../cf/taste/hadoop/ToEntityPrefsMapper.java | 78 +
.../cf/taste/hadoop/ToItemPrefsMapper.java | 46 +
.../mahout/cf/taste/hadoop/TopItemsQueue.java | 60 +
.../apache/mahout/cf/taste/hadoop/als/ALS.java | 100 +
.../cf/taste/hadoop/als/DatasetSplitter.java | 158 +
.../hadoop/als/FactorizationEvaluator.java | 166 +
.../hadoop/als/MultithreadedSharingMapper.java | 62 +
.../hadoop/als/ParallelALSFactorizationJob.java | 414 +
.../cf/taste/hadoop/als/PredictionMapper.java | 145 +
.../cf/taste/hadoop/als/RecommenderJob.java | 110 +
.../cf/taste/hadoop/als/SharingMapper.java | 59 +
.../hadoop/als/SolveExplicitFeedbackMapper.java | 61 +
.../hadoop/als/SolveImplicitFeedbackMapper.java | 58 +
.../item/AggregateAndRecommendReducer.java | 220 +
.../mahout/cf/taste/hadoop/item/IDReader.java | 244 +
.../item/ItemFilterAsVectorAndPrefsReducer.java | 62 +
.../cf/taste/hadoop/item/ItemFilterMapper.java | 47 +
.../cf/taste/hadoop/item/ItemIDIndexMapper.java | 56 +
.../taste/hadoop/item/ItemIDIndexReducer.java | 48 +
.../hadoop/item/PartialMultiplyMapper.java | 57 +
.../item/PrefAndSimilarityColumnWritable.java | 85 +
.../cf/taste/hadoop/item/RecommenderJob.java | 337 +
.../item/SimilarityMatrixRowWrapperMapper.java | 54 +
.../taste/hadoop/item/ToUserVectorsReducer.java | 84 +
.../hadoop/item/ToVectorAndPrefReducer.java | 63 +
.../hadoop/item/UserVectorSplitterMapper.java | 116 +
.../hadoop/item/VectorAndPrefsWritable.java | 92 +
.../taste/hadoop/item/VectorOrPrefWritable.java | 104 +
.../preparation/PreparePreferenceMatrixJob.java | 115 +
.../hadoop/preparation/ToItemVectorsMapper.java | 56 +
.../preparation/ToItemVectorsReducer.java | 38 +
.../similarity/item/ItemSimilarityJob.java | 233 +
.../similarity/item/TopSimilarItemsQueue.java | 60 +
.../common/AbstractLongPrimitiveIterator.java | 27 +
.../mahout/cf/taste/impl/common/BitSet.java | 93 +
.../mahout/cf/taste/impl/common/Cache.java | 178 +
.../cf/taste/impl/common/FastByIDMap.java | 661 +
.../mahout/cf/taste/impl/common/FastIDSet.java | 426 +
.../mahout/cf/taste/impl/common/FastMap.java | 729 +
.../taste/impl/common/FixedRunningAverage.java | 83 +
.../common/FixedRunningAverageAndStdDev.java | 51 +
.../taste/impl/common/FullRunningAverage.java | 109 +
.../common/FullRunningAverageAndStdDev.java | 107 +
.../impl/common/InvertedRunningAverage.java | 58 +
.../common/InvertedRunningAverageAndStdDev.java | 63 +
.../impl/common/LongPrimitiveArrayIterator.java | 93 +
.../impl/common/LongPrimitiveIterator.java | 39 +
.../cf/taste/impl/common/RefreshHelper.java | 122 +
.../mahout/cf/taste/impl/common/Retriever.java | 36 +
.../cf/taste/impl/common/RunningAverage.java | 67 +
.../impl/common/RunningAverageAndStdDev.java | 36 +
.../common/SamplingLongPrimitiveIterator.java | 111 +
.../cf/taste/impl/common/SkippingIterator.java | 35 +
.../impl/common/WeightedRunningAverage.java | 100 +
.../common/WeightedRunningAverageAndStdDev.java | 89 +
.../impl/common/jdbc/AbstractJDBCComponent.java | 88 +
.../taste/impl/common/jdbc/EachRowIterator.java | 92 +
.../impl/common/jdbc/ResultSetIterator.java | 66 +
.../AbstractDifferenceRecommenderEvaluator.java | 276 +
...eAbsoluteDifferenceRecommenderEvaluator.java | 59 +
.../GenericRecommenderIRStatsEvaluator.java | 237 +
.../eval/GenericRelevantItemsDataSplitter.java | 83 +
.../cf/taste/impl/eval/IRStatisticsImpl.java | 95 +
.../mahout/cf/taste/impl/eval/LoadCallable.java | 40 +
.../cf/taste/impl/eval/LoadEvaluator.java | 61 +
.../cf/taste/impl/eval/LoadStatistics.java | 34 +
.../eval/OrderBasedRecommenderEvaluator.java | 431 +
.../impl/eval/RMSRecommenderEvaluator.java | 56 +
.../cf/taste/impl/eval/StatsCallable.java | 64 +
.../cf/taste/impl/model/AbstractDataModel.java | 53 +
.../cf/taste/impl/model/AbstractIDMigrator.java | 66 +
.../impl/model/AbstractJDBCIDMigrator.java | 108 +
.../impl/model/BooleanItemPreferenceArray.java | 234 +
.../cf/taste/impl/model/BooleanPreference.java | 64 +
.../impl/model/BooleanUserPreferenceArray.java | 234 +
.../impl/model/GenericBooleanPrefDataModel.java | 320 +
.../cf/taste/impl/model/GenericDataModel.java | 361 +
.../impl/model/GenericItemPreferenceArray.java | 301 +
.../cf/taste/impl/model/GenericPreference.java | 70 +
.../impl/model/GenericUserPreferenceArray.java | 307 +
.../cf/taste/impl/model/MemoryIDMigrator.java | 55 +
.../taste/impl/model/MySQLJDBCIDMigrator.java | 67 +
.../PlusAnonymousConcurrentUserDataModel.java | 352 +
.../impl/model/PlusAnonymousUserDataModel.java | 320 +
.../PlusAnonymousUserLongPrimitiveIterator.java | 90 +
.../cf/taste/impl/model/file/FileDataModel.java | 758 +
.../taste/impl/model/file/FileIDMigrator.java | 117 +
.../neighborhood/AbstractUserNeighborhood.java | 71 +
.../neighborhood/CachingUserNeighborhood.java | 69 +
.../neighborhood/NearestNUserNeighborhood.java | 122 +
.../neighborhood/ThresholdUserNeighborhood.java | 104 +
.../AbstractCandidateItemsStrategy.java | 57 +
.../impl/recommender/AbstractRecommender.java | 140 +
.../AllSimilarItemsCandidateItemsStrategy.java | 50 +
.../AllUnknownItemsCandidateItemsStrategy.java | 41 +
.../impl/recommender/ByRescoreComparator.java | 65 +
.../ByValueRecommendedItemComparator.java | 43 +
.../impl/recommender/CachingRecommender.java | 251 +
.../recommender/EstimatedPreferenceCapper.java | 46 +
.../GenericBooleanPrefItemBasedRecommender.java | 71 +
.../GenericBooleanPrefUserBasedRecommender.java | 82 +
.../GenericItemBasedRecommender.java | 378 +
.../recommender/GenericRecommendedItem.java | 76 +
.../GenericUserBasedRecommender.java | 247 +
.../recommender/ItemAverageRecommender.java | 199 +
.../recommender/ItemUserAverageRecommender.java | 240 +
.../cf/taste/impl/recommender/NullRescorer.java | 86 +
...ItemsNeighborhoodCandidateItemsStrategy.java | 48 +
.../impl/recommender/RandomRecommender.java | 97 +
.../SamplingCandidateItemsStrategy.java | 165 +
.../cf/taste/impl/recommender/SimilarUser.java | 80 +
.../cf/taste/impl/recommender/TopItems.java | 211 +
.../impl/recommender/svd/ALSWRFactorizer.java | 312 +
.../recommender/svd/AbstractFactorizer.java | 94 +
.../impl/recommender/svd/Factorization.java | 137 +
.../taste/impl/recommender/svd/Factorizer.java | 30 +
.../svd/FilePersistenceStrategy.java | 139 +
.../recommender/svd/NoPersistenceStrategy.java | 37 +
.../recommender/svd/ParallelSGDFactorizer.java | 340 +
.../recommender/svd/PersistenceStrategy.java | 46 +
.../recommender/svd/RatingSGDFactorizer.java | 221 +
.../recommender/svd/SVDPlusPlusFactorizer.java | 178 +
.../impl/recommender/svd/SVDPreference.java | 41 +
.../impl/recommender/svd/SVDRecommender.java | 185 +
.../impl/similarity/AbstractItemSimilarity.java | 64 +
.../impl/similarity/AbstractSimilarity.java | 343 +
.../similarity/AveragingPreferenceInferrer.java | 85 +
.../impl/similarity/CachingItemSimilarity.java | 111 +
.../impl/similarity/CachingUserSimilarity.java | 104 +
.../impl/similarity/CityBlockSimilarity.java | 98 +
.../similarity/EuclideanDistanceSimilarity.java | 67 +
.../impl/similarity/GenericItemSimilarity.java | 358 +
.../impl/similarity/GenericUserSimilarity.java | 238 +
.../similarity/LogLikelihoodSimilarity.java | 121 +
.../impl/similarity/LongPairMatchPredicate.java | 40 +
.../PearsonCorrelationSimilarity.java | 93 +
.../SpearmanCorrelationSimilarity.java | 135 +
.../TanimotoCoefficientSimilarity.java | 126 +
.../similarity/UncenteredCosineSimilarity.java | 69 +
.../file/FileItemItemSimilarityIterable.java | 46 +
.../file/FileItemItemSimilarityIterator.java | 60 +
.../similarity/file/FileItemSimilarity.java | 137 +
.../precompute/FileSimilarItemsWriter.java | 67 +
.../MultithreadedBatchItemSimilarities.java | 230 +
.../apache/mahout/cf/taste/model/DataModel.java | 199 +
.../mahout/cf/taste/model/IDMigrator.java | 63 +
.../mahout/cf/taste/model/JDBCDataModel.java | 43 +
.../mahout/cf/taste/model/Preference.java | 48 +
.../mahout/cf/taste/model/PreferenceArray.java | 143 +
.../cf/taste/model/UpdatableIDMigrator.java | 47 +
.../cf/taste/neighborhood/UserNeighborhood.java | 40 +
.../recommender/CandidateItemsStrategy.java | 37 +
.../mahout/cf/taste/recommender/IDRescorer.java | 47 +
.../taste/recommender/ItemBasedRecommender.java | 145 +
.../MostSimilarItemsCandidateItemsStrategy.java | 31 +
.../cf/taste/recommender/RecommendedItem.java | 41 +
.../cf/taste/recommender/Recommender.java | 132 +
.../mahout/cf/taste/recommender/Rescorer.java | 52 +
.../taste/recommender/UserBasedRecommender.java | 54 +
.../cf/taste/similarity/ItemSimilarity.java | 64 +
.../cf/taste/similarity/PreferenceInferrer.java | 47 +
.../cf/taste/similarity/UserSimilarity.java | 58 +
.../precompute/BatchItemSimilarities.java | 56 +
.../similarity/precompute/SimilarItem.java | 56 +
.../similarity/precompute/SimilarItems.java | 84 +
.../precompute/SimilarItemsWriter.java | 33 +
.../classifier/AbstractVectorClassifier.java | 248 +
.../mahout/classifier/ClassifierResult.java | 74 +
.../mahout/classifier/ConfusionMatrix.java | 444 +
.../apache/mahout/classifier/OnlineLearner.java | 96 +
.../classifier/RegressionResultAnalyzer.java | 144 +
.../mahout/classifier/ResultAnalyzer.java | 132 +
.../apache/mahout/classifier/df/Bagging.java | 61 +
.../apache/mahout/classifier/df/DFUtils.java | 174 +
.../mahout/classifier/df/DecisionForest.java | 241 +
.../mahout/classifier/df/ErrorEstimate.java | 51 +
.../df/builder/DecisionTreeBuilder.java | 422 +
.../df/builder/DefaultTreeBuilder.java | 253 +
.../classifier/df/builder/TreeBuilder.java | 42 +
.../apache/mahout/classifier/df/data/Data.java | 281 +
.../classifier/df/data/DataConverter.java | 72 +
.../mahout/classifier/df/data/DataLoader.java | 255 +
.../mahout/classifier/df/data/DataUtils.java | 89 +
.../mahout/classifier/df/data/Dataset.java | 422 +
.../classifier/df/data/DescriptorException.java | 28 +
.../classifier/df/data/DescriptorUtils.java | 110 +
.../mahout/classifier/df/data/Instance.java | 75 +
.../df/data/conditions/Condition.java | 57 +
.../classifier/df/data/conditions/Equals.java | 42 +
.../df/data/conditions/GreaterOrEquals.java | 42 +
.../classifier/df/data/conditions/Lesser.java | 42 +
.../mahout/classifier/df/mapreduce/Builder.java | 333 +
.../classifier/df/mapreduce/Classifier.java | 238 +
.../classifier/df/mapreduce/MapredMapper.java | 75 +
.../classifier/df/mapreduce/MapredOutput.java | 120 +
.../df/mapreduce/inmem/InMemBuilder.java | 114 +
.../df/mapreduce/inmem/InMemInputFormat.java | 284 +
.../df/mapreduce/inmem/InMemMapper.java | 106 +
.../df/mapreduce/inmem/package-info.java | 22 +
.../df/mapreduce/partial/PartialBuilder.java | 158 +
.../df/mapreduce/partial/Step1Mapper.java | 168 +
.../classifier/df/mapreduce/partial/TreeID.java | 58 +
.../df/mapreduce/partial/package-info.java | 16 +
.../classifier/df/node/CategoricalNode.java | 134 +
.../apache/mahout/classifier/df/node/Leaf.java | 95 +
.../apache/mahout/classifier/df/node/Node.java | 96 +
.../classifier/df/node/NumericalNode.java | 115 +
.../classifier/df/ref/SequentialBuilder.java | 78 +
.../classifier/df/split/DefaultIgSplit.java | 118 +
.../mahout/classifier/df/split/IgSplit.java | 35 +
.../mahout/classifier/df/split/OptIgSplit.java | 232 +
.../classifier/df/split/RegressionSplit.java | 177 +
.../mahout/classifier/df/split/Split.java | 68 +
.../mahout/classifier/df/tools/Describe.java | 166 +
.../classifier/df/tools/ForestVisualizer.java | 158 +
.../mahout/classifier/df/tools/Frequencies.java | 122 +
.../classifier/df/tools/FrequenciesJob.java | 297 +
.../classifier/df/tools/TreeVisualizer.java | 264 +
.../mahout/classifier/df/tools/UDistrib.java | 212 +
.../mahout/classifier/evaluation/Auc.java | 233 +
.../AbstractNaiveBayesClassifier.java | 82 +
.../classifier/naivebayes/BayesUtils.java | 161 +
.../ComplementaryNaiveBayesClassifier.java | 43 +
.../classifier/naivebayes/NaiveBayesModel.java | 170 +
.../StandardNaiveBayesClassifier.java | 40 +
.../naivebayes/test/BayesTestMapper.java | 76 +
.../naivebayes/test/TestNaiveBayesDriver.java | 176 +
.../training/ComplementaryThetaTrainer.java | 83 +
.../training/IndexInstancesMapper.java | 53 +
.../naivebayes/training/ThetaMapper.java | 61 +
.../naivebayes/training/TrainNaiveBayesJob.java | 177 +
.../naivebayes/training/WeightsMapper.java | 68 +
.../sequencelearning/hmm/BaumWelchTrainer.java | 161 +
.../sequencelearning/hmm/HmmAlgorithms.java | 306 +
.../sequencelearning/hmm/HmmEvaluator.java | 194 +
.../sequencelearning/hmm/HmmModel.java | 383 +
.../sequencelearning/hmm/HmmTrainer.java | 488 +
.../sequencelearning/hmm/HmmUtils.java | 360 +
.../hmm/LossyHmmSerializer.java | 62 +
.../hmm/RandomSequenceGenerator.java | 102 +
.../sequencelearning/hmm/ViterbiEvaluator.java | 122 +
.../sgd/AbstractOnlineLogisticRegression.java | 317 +
.../sgd/AdaptiveLogisticRegression.java | 586 +
.../mahout/classifier/sgd/CrossFoldLearner.java | 334 +
.../mahout/classifier/sgd/CsvRecordFactory.java | 395 +
.../mahout/classifier/sgd/DefaultGradient.java | 49 +
.../mahout/classifier/sgd/ElasticBandPrior.java | 76 +
.../apache/mahout/classifier/sgd/Gradient.java | 30 +
.../mahout/classifier/sgd/GradientMachine.java | 405 +
.../org/apache/mahout/classifier/sgd/L1.java | 59 +
.../org/apache/mahout/classifier/sgd/L2.java | 66 +
.../mahout/classifier/sgd/MixedGradient.java | 66 +
.../mahout/classifier/sgd/ModelDissector.java | 232 +
.../mahout/classifier/sgd/ModelSerializer.java | 67 +
.../sgd/OnlineLogisticRegression.java | 172 +
.../classifier/sgd/PassiveAggressive.java | 204 +
.../classifier/sgd/PolymorphicWritable.java | 46 +
.../mahout/classifier/sgd/PriorFunction.java | 45 +
.../mahout/classifier/sgd/RankingGradient.java | 85 +
.../mahout/classifier/sgd/RecordFactory.java | 47 +
.../apache/mahout/classifier/sgd/TPrior.java | 61 +
.../mahout/classifier/sgd/UniformPrior.java | 47 +
.../mahout/classifier/sgd/package-info.java | 23 +
.../mahout/clustering/AbstractCluster.java | 390 +
.../org/apache/mahout/clustering/Cluster.java | 90 +
.../mahout/clustering/ClusteringUtils.java | 306 +
.../mahout/clustering/GaussianAccumulator.java | 62 +
.../org/apache/mahout/clustering/Model.java | 93 +
.../mahout/clustering/ModelDistribution.java | 41 +
.../clustering/OnlineGaussianAccumulator.java | 107 +
.../RunningSumsGaussianAccumulator.java | 90 +
.../clustering/UncommonDistributions.java | 136 +
.../apache/mahout/clustering/canopy/Canopy.java | 60 +
.../clustering/canopy/CanopyClusterer.java | 220 +
.../clustering/canopy/CanopyConfigKeys.java | 70 +
.../mahout/clustering/canopy/CanopyDriver.java | 379 +
.../mahout/clustering/canopy/CanopyMapper.java | 66 +
.../mahout/clustering/canopy/CanopyReducer.java | 70 +
.../ClusterClassificationConfigKeys.java | 33 +
.../classify/ClusterClassificationDriver.java | 313 +
.../classify/ClusterClassificationMapper.java | 161 +
.../clustering/classify/ClusterClassifier.java | 231 +
.../WeightedPropertyVectorWritable.java | 95 +
.../classify/WeightedVectorWritable.java | 72 +
.../fuzzykmeans/FuzzyKMeansClusterer.java | 59 +
.../fuzzykmeans/FuzzyKMeansDriver.java | 324 +
.../clustering/fuzzykmeans/FuzzyKMeansUtil.java | 76 +
.../clustering/fuzzykmeans/SoftCluster.java | 60 +
.../iterator/AbstractClusteringPolicy.java | 72 +
.../mahout/clustering/iterator/CIMapper.java | 71 +
.../mahout/clustering/iterator/CIReducer.java | 64 +
.../iterator/CanopyClusteringPolicy.java | 52 +
.../clustering/iterator/ClusterIterator.java | 219 +
.../clustering/iterator/ClusterWritable.java | 56 +
.../clustering/iterator/ClusteringPolicy.java | 66 +
.../iterator/ClusteringPolicyWritable.java | 55 +
.../iterator/DistanceMeasureCluster.java | 91 +
.../iterator/FuzzyKMeansClusteringPolicy.java | 90 +
.../iterator/KMeansClusteringPolicy.java | 64 +
.../clustering/kernel/IKernelProfile.java | 27 +
.../kernel/TriangularKernelProfile.java | 27 +
.../mahout/clustering/kmeans/KMeansDriver.java | 257 +
.../mahout/clustering/kmeans/KMeansUtil.java | 74 +
.../mahout/clustering/kmeans/Kluster.java | 117 +
.../clustering/kmeans/RandomSeedGenerator.java | 136 +
.../mahout/clustering/kmeans/package-info.java | 5 +
.../lda/cvb/CVB0DocInferenceMapper.java | 51 +
.../mahout/clustering/lda/cvb/CVB0Driver.java | 536 +
.../CVB0TopicTermVectorNormalizerMapper.java | 38 +
.../clustering/lda/cvb/CachingCVB0Mapper.java | 133 +
.../lda/cvb/CachingCVB0PerplexityMapper.java | 108 +
.../cvb/InMemoryCollapsedVariationalBayes0.java | 492 +
.../mahout/clustering/lda/cvb/ModelTrainer.java | 301 +
.../mahout/clustering/lda/cvb/TopicModel.java | 513 +
.../apache/mahout/clustering/package-info.java | 13 +
.../spectral/AffinityMatrixInputJob.java | 84 +
.../spectral/AffinityMatrixInputMapper.java | 78 +
.../spectral/AffinityMatrixInputReducer.java | 59 +
.../spectral/IntDoublePairWritable.java | 75 +
.../apache/mahout/clustering/spectral/Keys.java | 31 +
.../spectral/MatrixDiagonalizeJob.java | 108 +
.../clustering/spectral/UnitVectorizerJob.java | 79 +
.../mahout/clustering/spectral/VectorCache.java | 116 +
.../spectral/VectorMatrixMultiplicationJob.java | 139 +
.../clustering/spectral/VertexWritable.java | 101 +
.../spectral/kmeans/EigenSeedGenerator.java | 120 +
.../spectral/kmeans/SpectralKMeansDriver.java | 243 +
.../streaming/cluster/BallKMeans.java | 456 +
.../streaming/cluster/StreamingKMeans.java | 368 +
.../streaming/mapreduce/CentroidWritable.java | 88 +
.../mapreduce/StreamingKMeansDriver.java | 493 +
.../mapreduce/StreamingKMeansMapper.java | 102 +
.../mapreduce/StreamingKMeansReducer.java | 109 +
.../mapreduce/StreamingKMeansThread.java | 92 +
.../mapreduce/StreamingKMeansUtilsMR.java | 154 +
.../streaming/tools/ResplitSequenceFiles.java | 149 +
.../clustering/topdown/PathDirectory.java | 94 +
.../postprocessor/ClusterCountReader.java | 103 +
.../ClusterOutputPostProcessor.java | 139 +
.../ClusterOutputPostProcessorDriver.java | 182 +
.../ClusterOutputPostProcessorMapper.java | 58 +
.../ClusterOutputPostProcessorReducer.java | 62 +
.../org/apache/mahout/common/AbstractJob.java | 648 +
.../org/apache/mahout/common/ClassUtils.java | 61 +
.../apache/mahout/common/CommandLineUtil.java | 68 +
.../org/apache/mahout/common/HadoopUtil.java | 435 +
.../apache/mahout/common/IntPairWritable.java | 270 +
.../org/apache/mahout/common/IntegerTuple.java | 176 +
.../java/org/apache/mahout/common/LongPair.java | 80 +
.../org/apache/mahout/common/MemoryUtil.java | 99 +
.../java/org/apache/mahout/common/Pair.java | 99 +
.../org/apache/mahout/common/Parameters.java | 98 +
.../org/apache/mahout/common/StringTuple.java | 177 +
.../org/apache/mahout/common/StringUtils.java | 63 +
.../apache/mahout/common/TimingStatistics.java | 154 +
.../commandline/DefaultOptionCreator.java | 417 +
.../distance/ChebyshevDistanceMeasure.java | 63 +
.../common/distance/CosineDistanceMeasure.java | 119 +
.../mahout/common/distance/DistanceMeasure.java | 48 +
.../distance/EuclideanDistanceMeasure.java | 41 +
.../distance/MahalanobisDistanceMeasure.java | 197 +
.../distance/ManhattanDistanceMeasure.java | 70 +
.../distance/MinkowskiDistanceMeasure.java | 93 +
.../SquaredEuclideanDistanceMeasure.java | 59 +
.../distance/TanimotoDistanceMeasure.java | 69 +
.../distance/WeightedDistanceMeasure.java | 93 +
.../WeightedEuclideanDistanceMeasure.java | 51 +
.../WeightedManhattanDistanceMeasure.java | 53 +
.../iterator/CopyConstructorIterator.java | 64 +
.../common/iterator/CountingIterator.java | 43 +
.../common/iterator/FileLineIterable.java | 88 +
.../common/iterator/FileLineIterator.java | 167 +
.../iterator/FixedSizeSamplingIterator.java | 59 +
.../common/iterator/SamplingIterable.java | 45 +
.../common/iterator/SamplingIterator.java | 73 +
.../StableFixedSizeSamplingIterator.java | 72 +
.../common/iterator/StringRecordIterator.java | 55 +
.../iterator/sequencefile/PathFilters.java | 81 +
.../common/iterator/sequencefile/PathType.java | 27 +
.../sequencefile/SequenceFileDirIterable.java | 84 +
.../sequencefile/SequenceFileDirIterator.java | 136 +
.../SequenceFileDirValueIterable.java | 83 +
.../SequenceFileDirValueIterator.java | 159 +
.../sequencefile/SequenceFileIterable.java | 68 +
.../sequencefile/SequenceFileIterator.java | 118 +
.../sequencefile/SequenceFileValueIterable.java | 67 +
.../sequencefile/SequenceFileValueIterator.java | 97 +
.../mahout/common/lucene/AnalyzerUtils.java | 61 +
.../common/lucene/IteratorTokenStream.java | 45 +
.../common/lucene/TokenStreamIterator.java | 57 +
.../common/mapreduce/MergeVectorsCombiner.java | 34 +
.../common/mapreduce/MergeVectorsReducer.java | 40 +
.../common/mapreduce/TransposeMapper.java | 49 +
.../common/mapreduce/VectorSumCombiner.java | 38 +
.../common/mapreduce/VectorSumReducer.java | 35 +
.../org/apache/mahout/common/nlp/NGrams.java | 94 +
.../common/parameters/AbstractParameter.java | 120 +
.../common/parameters/ClassParameter.java | 44 +
.../common/parameters/DoubleParameter.java | 33 +
.../mahout/common/parameters/Parameter.java | 62 +
.../mahout/common/parameters/Parametered.java | 206 +
.../mahout/common/parameters/PathParameter.java | 33 +
.../org/apache/mahout/driver/MahoutDriver.java | 244 +
.../apache/mahout/ep/EvolutionaryProcess.java | 229 +
.../main/java/org/apache/mahout/ep/Mapping.java | 206 +
.../main/java/org/apache/mahout/ep/Payload.java | 36 +
.../main/java/org/apache/mahout/ep/State.java | 302 +
.../java/org/apache/mahout/ep/package-info.java | 26 +
.../mahout/math/DistributedRowMatrixWriter.java | 47 +
.../org/apache/mahout/math/MatrixUtils.java | 114 +
.../mahout/math/MultiLabelVectorWritable.java | 88 +
.../math/als/AlternatingLeastSquaresSolver.java | 116 +
...itFeedbackAlternatingLeastSquaresSolver.java | 171 +
.../math/decomposer/AsyncEigenVerifier.java | 80 +
.../mahout/math/decomposer/EigenStatus.java | 50 +
.../math/decomposer/SimpleEigenVerifier.java | 41 +
.../math/decomposer/SingularVectorVerifier.java | 25 +
.../math/decomposer/hebbian/EigenUpdater.java | 25 +
.../math/decomposer/hebbian/HebbianSolver.java | 342 +
.../math/decomposer/hebbian/HebbianUpdater.java | 71 +
.../math/decomposer/hebbian/TrainingState.java | 143 +
.../math/decomposer/lanczos/LanczosSolver.java | 213 +
.../math/decomposer/lanczos/LanczosState.java | 107 +
.../math/hadoop/DistributedRowMatrix.java | 390 +
.../math/hadoop/MatrixColumnMeansJob.java | 236 +
.../math/hadoop/MatrixMultiplicationJob.java | 177 +
.../mahout/math/hadoop/TimesSquaredJob.java | 251 +
.../apache/mahout/math/hadoop/TransposeJob.java | 85 +
.../decomposer/DistributedLanczosSolver.java | 299 +
.../math/hadoop/decomposer/EigenVector.java | 76 +
.../hadoop/decomposer/EigenVerificationJob.java | 333 +
.../decomposer/HdfsBackedLanczosState.java | 237 +
.../math/hadoop/similarity/SeedVectorUtil.java | 104 +
.../VectorDistanceInvertedMapper.java | 71 +
.../hadoop/similarity/VectorDistanceMapper.java | 80 +
.../similarity/VectorDistanceSimilarityJob.java | 153 +
.../similarity/cooccurrence/MutableElement.java | 50 +
.../cooccurrence/RowSimilarityJob.java | 562 +
.../cooccurrence/TopElementsQueue.java | 59 +
.../hadoop/similarity/cooccurrence/Vectors.java | 199 +
.../measures/CityBlockSimilarity.java | 26 +
.../measures/CooccurrenceCountSimilarity.java | 32 +
.../cooccurrence/measures/CosineSimilarity.java | 50 +
.../measures/CountbasedMeasure.java | 44 +
.../measures/EuclideanDistanceSimilarity.java | 57 +
.../measures/LoglikelihoodSimilarity.java | 34 +
.../measures/PearsonCorrelationSimilarity.java | 37 +
.../measures/TanimotoCoefficientSimilarity.java | 34 +
.../measures/VectorSimilarityMeasure.java | 32 +
.../measures/VectorSimilarityMeasures.java | 46 +
.../DistributedConjugateGradientSolver.java | 172 +
.../mahout/math/hadoop/stats/BasicStats.java | 148 +
.../StandardDeviationCalculatorMapper.java | 55 +
.../StandardDeviationCalculatorReducer.java | 37 +
.../math/hadoop/stats/VarianceTotals.java | 68 +
.../hadoop/stochasticsvd/ABtDenseOutJob.java | 585 +
.../math/hadoop/stochasticsvd/ABtJob.java | 494 +
.../mahout/math/hadoop/stochasticsvd/BtJob.java | 628 +
.../stochasticsvd/DenseBlockWritable.java | 83 +
.../mahout/math/hadoop/stochasticsvd/Omega.java | 257 +
.../mahout/math/hadoop/stochasticsvd/QJob.java | 237 +
.../math/hadoop/stochasticsvd/SSVDCli.java | 201 +
.../math/hadoop/stochasticsvd/SSVDHelper.java | 322 +
.../math/hadoop/stochasticsvd/SSVDSolver.java | 662 +
.../SparseRowBlockAccumulator.java | 90 +
.../stochasticsvd/SparseRowBlockWritable.java | 159 +
.../stochasticsvd/SplitPartitionedWritable.java | 151 +
.../mahout/math/hadoop/stochasticsvd/UJob.java | 170 +
.../mahout/math/hadoop/stochasticsvd/VJob.java | 224 +
.../math/hadoop/stochasticsvd/YtYJob.java | 220 +
.../stochasticsvd/qr/GivensThinSolver.java | 643 +
.../hadoop/stochasticsvd/qr/GramSchmidt.java | 52 +
.../hadoop/stochasticsvd/qr/QRFirstStep.java | 284 +
.../hadoop/stochasticsvd/qr/QRLastStep.java | 144 +
.../mahout/math/neighborhood/BruteSearch.java | 186 +
.../math/neighborhood/FastProjectionSearch.java | 326 +
.../mahout/math/neighborhood/HashedVector.java | 103 +
.../LocalitySensitiveHashSearch.java | 295 +
.../math/neighborhood/ProjectionSearch.java | 233 +
.../mahout/math/neighborhood/Searcher.java | 155 +
.../math/neighborhood/UpdatableSearcher.java | 37 +
.../math/random/AbstractSamplerFunction.java | 39 +
.../mahout/math/random/ChineseRestaurant.java | 111 +
.../apache/mahout/math/random/Empirical.java | 124 +
.../apache/mahout/math/random/IndianBuffet.java | 157 +
.../org/apache/mahout/math/random/Missing.java | 59 +
.../apache/mahout/math/random/MultiNormal.java | 118 +
.../apache/mahout/math/random/Multinomial.java | 202 +
.../org/apache/mahout/math/random/Normal.java | 40 +
.../mahout/math/random/PoissonSampler.java | 67 +
.../mahout/math/random/RandomProjector.java | 133 +
.../org/apache/mahout/math/random/Sampler.java | 25 +
.../mahout/math/random/WeightedThing.java | 71 +
.../mahout/math/ssvd/SequentialBigSvd.java | 69 +
.../math/ssvd/SequentialOutOfCoreSvd.java | 233 +
.../mahout/math/stats/GlobalOnlineAuc.java | 168 +
.../mahout/math/stats/GroupedOnlineAuc.java | 113 +
.../org/apache/mahout/math/stats/OnlineAuc.java | 38 +
.../mahout/math/stats/OnlineSummarizer.java | 93 +
.../org/apache/mahout/math/stats/Sampler.java | 79 +
.../mahout/vectorizer/DictionaryVectorizer.java | 422 +
.../mahout/vectorizer/DocumentProcessor.java | 99 +
.../EncodedVectorsFromSequenceFiles.java | 104 +
.../mahout/vectorizer/EncodingMapper.java | 92 +
.../mahout/vectorizer/HighDFWordsPruner.java | 147 +
.../SimpleTextEncodingVectorizer.java | 72 +
.../SparseVectorsFromSequenceFiles.java | 369 +
.../java/org/apache/mahout/vectorizer/TF.java | 30 +
.../org/apache/mahout/vectorizer/TFIDF.java | 31 +
.../apache/mahout/vectorizer/Vectorizer.java | 29 +
.../mahout/vectorizer/VectorizerConfig.java | 179 +
.../org/apache/mahout/vectorizer/Weight.java | 32 +
.../collocations/llr/CollocCombiner.java | 46 +
.../collocations/llr/CollocDriver.java | 284 +
.../collocations/llr/CollocMapper.java | 178 +
.../collocations/llr/CollocReducer.java | 176 +
.../vectorizer/collocations/llr/Gram.java | 239 +
.../vectorizer/collocations/llr/GramKey.java | 133 +
.../llr/GramKeyGroupComparator.java | 43 +
.../collocations/llr/GramKeyPartitioner.java | 40 +
.../vectorizer/collocations/llr/LLRReducer.java | 170 +
.../common/PartialVectorMergeReducer.java | 89 +
.../vectorizer/common/PartialVectorMerger.java | 144 +
.../document/SequenceFileTokenizerMapper.java | 70 +
.../encoders/AdaptiveWordValueEncoder.java | 69 +
.../encoders/CachingContinuousValueEncoder.java | 64 +
.../encoders/CachingStaticWordValueEncoder.java | 66 +
.../encoders/CachingTextValueEncoder.java | 25 +
.../encoders/CachingValueEncoder.java | 64 +
.../encoders/ConstantValueEncoder.java | 57 +
.../encoders/ContinuousValueEncoder.java | 76 +
.../mahout/vectorizer/encoders/Dictionary.java | 54 +
.../encoders/FeatureVectorEncoder.java | 279 +
.../encoders/InteractionValueEncoder.java | 126 +
.../encoders/LuceneTextValueEncoder.java | 129 +
.../encoders/StaticWordValueEncoder.java | 80 +
.../vectorizer/encoders/TextValueEncoder.java | 142 +
.../vectorizer/encoders/WordValueEncoder.java | 81 +
.../pruner/PrunedPartialVectorMergeReducer.java | 65 +
.../vectorizer/pruner/WordsPrunerReducer.java | 86 +
.../vectorizer/term/TFPartialVectorReducer.java | 139 +
.../vectorizer/term/TermCountCombiner.java | 41 +
.../mahout/vectorizer/term/TermCountMapper.java | 58 +
.../vectorizer/term/TermCountReducer.java | 55 +
.../term/TermDocumentCountMapper.java | 50 +
.../term/TermDocumentCountReducer.java | 41 +
.../mahout/vectorizer/tfidf/TFIDFConverter.java | 361 +
.../tfidf/TFIDFPartialVectorReducer.java | 114 +
.../src/main/resources/supplemental-models.xml | 279 +
.../mahout-mr/mr/src/main/resources/version | 1 +
.../mahout/cf/taste/common/CommonTest.java | 60 +
.../cf/taste/hadoop/TasteHadoopUtilsTest.java | 40 +
.../cf/taste/hadoop/TopItemsQueueTest.java | 72 +
.../als/ParallelALSFactorizationJobTest.java | 379 +
.../cf/taste/hadoop/item/IDReaderTest.java | 66 +
.../taste/hadoop/item/RecommenderJobTest.java | 928 +
.../hadoop/item/ToUserVectorsReducerTest.java | 74 +
.../similarity/item/ItemSimilarityJobTest.java | 269 +
.../mahout/cf/taste/impl/TasteTestCase.java | 98 +
.../mahout/cf/taste/impl/common/BitSetTest.java | 74 +
.../mahout/cf/taste/impl/common/CacheTest.java | 61 +
.../cf/taste/impl/common/FastByIDMapTest.java | 147 +
.../cf/taste/impl/common/FastIDSetTest.java | 162 +
.../cf/taste/impl/common/FastMapTest.java | 228 +
.../impl/common/InvertedRunningAverageTest.java | 88 +
.../common/LongPrimitiveArrayIteratorTest.java | 56 +
.../cf/taste/impl/common/MockRefreshable.java | 45 +
.../cf/taste/impl/common/RefreshHelperTest.java | 70 +
.../common/RunningAverageAndStdDevTest.java | 107 +
.../taste/impl/common/RunningAverageTest.java | 75 +
.../SamplingLongPrimitiveIteratorTest.java | 91 +
.../impl/common/WeightedRunningAverageTest.java | 85 +
...ericRecommenderIRStatsEvaluatorImplTest.java | 73 +
.../taste/impl/eval/LoadEvaluationRunner.java | 68 +
.../model/BooleanItemPreferenceArrayTest.java | 89 +
.../model/BooleanUserPreferenceArrayTest.java | 89 +
.../taste/impl/model/GenericDataModelTest.java | 51 +
.../model/GenericItemPreferenceArrayTest.java | 110 +
.../model/GenericUserPreferenceArrayTest.java | 110 +
.../taste/impl/model/MemoryIDMigratorTest.java | 57 +
...lusAnonymousConcurrentUserDataModelTest.java | 313 +
.../impl/model/file/FileDataModelTest.java | 216 +
.../impl/model/file/FileIDMigratorTest.java | 103 +
.../impl/neighborhood/DummySimilarity.java | 68 +
.../neighborhood/NearestNNeighborhoodTest.java | 53 +
.../neighborhood/ThresholdNeighborhoodTest.java | 51 +
...lUnknownItemsCandidateItemsStrategyTest.java | 65 +
.../recommender/CachingRecommenderTest.java | 78 +
.../GenericItemBasedRecommenderTest.java | 324 +
.../GenericUserBasedRecommenderTest.java | 174 +
.../recommender/ItemAverageRecommenderTest.java | 43 +
.../ItemUserAverageRecommenderTest.java | 43 +
.../taste/impl/recommender/MockRecommender.java | 89 +
.../impl/recommender/NullRescorerTest.java | 47 +
...sNeighborhoodCandidateItemsStrategyTest.java | 75 +
.../impl/recommender/RandomRecommenderTest.java | 41 +
.../impl/recommender/ReversingRescorer.java | 46 +
.../SamplingCandidateItemsStrategyTest.java | 71 +
.../cf/taste/impl/recommender/TopItemsTest.java | 158 +
.../recommender/svd/ALSWRFactorizerTest.java | 208 +
.../svd/FilePersistenceStrategyTest.java | 53 +
.../svd/ParallelSGDFactorizerTest.java | 355 +
.../recommender/svd/SVDRecommenderTest.java | 86 +
.../AveragingPreferenceInferrerTest.java | 37 +
.../EuclideanDistanceSimilarityTest.java | 236 +
.../similarity/GenericItemSimilarityTest.java | 104 +
.../similarity/LogLikelihoodSimilarityTest.java | 80 +
.../PearsonCorrelationSimilarityTest.java | 265 +
.../impl/similarity/SimilarityTestCase.java | 35 +
.../SpearmanCorrelationSimilarityTest.java | 80 +
.../TanimotoCoefficientSimilarityTest.java | 121 +
.../similarity/file/FileItemSimilarityTest.java | 142 +
.../MultithreadedBatchItemSimilaritiesTest.java | 98 +
.../similarity/precompute/SimilarItemsTest.java | 50 +
.../mahout/classifier/ClassifierData.java | 102 +
.../mahout/classifier/ConfusionMatrixTest.java | 119 +
.../RegressionResultAnalyzerTest.java | 128 +
.../classifier/df/DecisionForestTest.java | 206 +
.../df/builder/DecisionTreeBuilderTest.java | 78 +
.../df/builder/DefaultTreeBuilderTest.java | 74 +
.../df/builder/InfiniteRecursionTest.java | 60 +
.../classifier/df/data/DataConverterTest.java | 60 +
.../classifier/df/data/DataLoaderTest.java | 350 +
.../mahout/classifier/df/data/DataTest.java | 396 +
.../mahout/classifier/df/data/DatasetTest.java | 72 +
.../classifier/df/data/DescriptorUtilsTest.java | 92 +
.../apache/mahout/classifier/df/data/Utils.java | 284 +
.../mapreduce/inmem/InMemInputFormatTest.java | 109 +
.../df/mapreduce/inmem/InMemInputSplitTest.java | 77 +
.../mapreduce/partial/PartialBuilderTest.java | 197 +
.../df/mapreduce/partial/Step1MapperTest.java | 160 +
.../df/mapreduce/partial/TreeIDTest.java | 48 +
.../mahout/classifier/df/node/NodeTest.java | 108 +
.../classifier/df/split/DefaultIgSplitTest.java | 78 +
.../df/split/RegressionSplitTest.java | 87 +
.../classifier/df/tools/VisualizerTest.java | 211 +
.../mahout/classifier/evaluation/AucTest.java | 86 +
.../ComplementaryNaiveBayesClassifierTest.java | 47 +
.../naivebayes/NaiveBayesModelTest.java | 36 +
.../classifier/naivebayes/NaiveBayesTest.java | 135 +
.../naivebayes/NaiveBayesTestBase.java | 135 +
.../StandardNaiveBayesClassifierTest.java | 47 +
.../training/IndexInstancesMapperTest.java | 85 +
.../naivebayes/training/ThetaMapperTest.java | 61 +
.../naivebayes/training/WeightsMapperTest.java | 60 +
.../sequencelearning/hmm/HMMAlgorithmsTest.java | 164 +
.../sequencelearning/hmm/HMMEvaluatorTest.java | 63 +
.../sequencelearning/hmm/HMMModelTest.java | 32 +
.../sequencelearning/hmm/HMMTestBase.java | 73 +
.../sequencelearning/hmm/HMMTrainerTest.java | 163 +
.../sequencelearning/hmm/HMMUtilsTest.java | 161 +
.../sgd/AdaptiveLogisticRegressionTest.java | 186 +
.../classifier/sgd/CsvRecordFactoryTest.java | 90 +
.../classifier/sgd/GradientMachineTest.java | 41 +
.../classifier/sgd/ModelSerializerTest.java | 162 +
.../mahout/classifier/sgd/OnlineBaseTest.java | 160 +
.../sgd/OnlineLogisticRegressionTest.java | 330 +
.../classifier/sgd/PassiveAggressiveTest.java | 35 +
.../mahout/clustering/ClusteringTestUtils.java | 152 +
.../mahout/clustering/TestClusterInterface.java | 83 +
.../clustering/TestGaussianAccumulators.java | 186 +
.../clustering/canopy/TestCanopyCreation.java | 674 +
.../ClusterClassificationDriverTest.java | 255 +
.../fuzzykmeans/TestFuzzyKmeansClustering.java | 202 +
.../iterator/TestClusterClassifier.java | 238 +
.../clustering/kmeans/TestKmeansClustering.java | 385 +
.../kmeans/TestRandomSeedGenerator.java | 169 +
.../clustering/lda/cvb/TestCVBModelTrainer.java | 138 +
.../spectral/TestAffinityMatrixInputJob.java | 145 +
.../spectral/TestMatrixDiagonalizeJob.java | 116 +
.../spectral/TestUnitVectorizerJob.java | 65 +
.../clustering/spectral/TestVectorCache.java | 110 +
.../TestVectorMatrixMultiplicationJob.java | 75 +
.../spectral/kmeans/TestEigenSeedGenerator.java | 100 +
.../streaming/cluster/BallKMeansTest.java | 196 +
.../clustering/streaming/cluster/DataUtils.java | 92 +
.../streaming/cluster/StreamingKMeansTest.java | 169 +
.../mapreduce/StreamingKMeansTestMR.java | 282 +
.../tools/ResplitSequenceFilesTest.java | 80 +
.../clustering/topdown/PathDirectoryTest.java | 65 +
.../postprocessor/ClusterCountReaderTest.java | 121 +
.../ClusterOutputPostProcessorTest.java | 205 +
.../apache/mahout/common/AbstractJobTest.java | 240 +
.../DistributedCacheFileLocationTest.java | 46 +
.../mahout/common/DummyOutputCollector.java | 57 +
.../apache/mahout/common/DummyRecordWriter.java | 223 +
.../mahout/common/DummyRecordWriterTest.java | 45 +
.../mahout/common/DummyStatusReporter.java | 76 +
.../mahout/common/IntPairWritableTest.java | 114 +
.../apache/mahout/common/MahoutTestCase.java | 148 +
.../org/apache/mahout/common/MockIterator.java | 51 +
.../apache/mahout/common/StringUtilsTest.java | 70 +
.../distance/CosineDistanceMeasureTest.java | 66 +
.../distance/DefaultDistanceMeasureTest.java | 103 +
.../DefaultWeightedDistanceMeasureTest.java | 56 +
.../common/distance/TestChebyshevMeasure.java | 55 +
.../distance/TestEuclideanDistanceMeasure.java | 26 +
.../TestMahalanobisDistanceMeasure.java | 56 +
.../distance/TestManhattanDistanceMeasure.java | 26 +
.../common/distance/TestMinkowskiMeasure.java | 64 +
.../distance/TestTanimotoDistanceMeasure.java | 25 +
...estWeightedEuclideanDistanceMeasureTest.java | 25 +
.../TestWeightedManhattanDistanceMeasure.java | 26 +
.../common/iterator/CountingIteratorTest.java | 44 +
.../mahout/common/iterator/SamplerCase.java | 101 +
.../common/iterator/TestFixedSizeSampler.java | 33 +
.../common/iterator/TestSamplingIterator.java | 77 +
.../iterator/TestStableFixedSizeSampler.java | 33 +
.../mahout/common/lucene/AnalyzerUtilsTest.java | 38 +
.../apache/mahout/driver/MahoutDriverTest.java | 32 +
.../mahout/ep/EvolutionaryProcessTest.java | 81 +
.../apache/mahout/math/MatrixWritableTest.java | 148 +
.../java/org/apache/mahout/math/VarintTest.java | 189 +
.../apache/mahout/math/VectorWritableTest.java | 123 +
.../apache/mahout/math/hadoop/MathHelper.java | 236 +
.../math/hadoop/TestDistributedRowMatrix.java | 395 +
.../TestDistributedLanczosSolver.java | 132 +
.../TestDistributedLanczosSolverCLI.java | 190 +
.../TestVectorDistanceSimilarityJob.java | 238 +
.../cooccurrence/RowSimilarityJobTest.java | 214 +
.../measures/VectorSimilarityMeasuresTest.java | 133 +
.../TestDistributedConjugateGradientSolver.java | 59 +
...stDistributedConjugateGradientSolverCLI.java | 111 +
.../math/hadoop/stats/BasicStatsTest.java | 121 +
.../stochasticsvd/LocalSSVDPCASparseTest.java | 296 +
.../stochasticsvd/LocalSSVDSolverDenseTest.java | 206 +
.../LocalSSVDSolverSparseSequentialTest.java | 209 +
.../hadoop/stochasticsvd/SSVDCommonTest.java | 105 +
.../hadoop/stochasticsvd/SSVDTestsHelper.java | 172 +
.../LocalitySensitiveHashSearchTest.java | 119 +
.../mahout/math/neighborhood/LumpyData.java | 77 +
.../math/neighborhood/SearchQualityTest.java | 178 +
.../math/neighborhood/SearchSanityTest.java | 244 +
.../math/ssvd/SequentialOutOfCoreSvdTest.java | 195 +
.../apache/mahout/math/stats/OnlineAucTest.java | 127 +
.../apache/mahout/math/stats/SamplerTest.java | 45 +
.../vectorizer/DictionaryVectorizerTest.java | 220 +
.../vectorizer/DocumentProcessorTest.java | 81 +
.../EncodedVectorsFromSequenceFilesTest.java | 126 +
.../vectorizer/HighDFWordsPrunerTest.java | 154 +
.../vectorizer/RandomDocumentGenerator.java | 69 +
.../SparseVectorsFromSequenceFilesTest.java | 203 +
.../collocations/llr/CollocMapperTest.java | 180 +
.../collocations/llr/CollocReducerTest.java | 86 +
.../llr/GramKeyGroupComparatorTest.java | 45 +
.../llr/GramKeyPartitionerTest.java | 54 +
.../collocations/llr/GramKeyTest.java | 106 +
.../vectorizer/collocations/llr/GramTest.java | 215 +
.../collocations/llr/LLRReducerTest.java | 116 +
.../vectorizer/encoders/CachingEncoderTest.java | 48 +
.../encoders/ConstantValueEncoderTest.java | 74 +
.../encoders/ContinuousValueEncoderTest.java | 88 +
.../encoders/InteractionValueEncoderTest.java | 103 +
.../encoders/TextValueEncoderTest.java | 99 +
.../encoders/WordLikeValueEncoderTest.java | 99 +
.../mr/src/test/resources/FPGsynth.dat | 193 +
.../mahout-mr/mr/src/test/resources/cancer.csv | 684 +
.../mahout-mr/mr/src/test/resources/iris.csv | 151 +
.../mahout-mr/mr/src/test/resources/retail.dat | 88162 +++++++++++++++++
.../retail_results_with_min_sup_100.dat | 6438 ++
.../mahout-mr/mr/src/test/resources/sgd.csv | 61 +
.../mr/src/test/resources/word-list.txt | 512 +
community/mahout-mr/pom.xml | 259 +-
.../appended-resources/supplemental-models.xml | 279 -
.../src/images/logos/ mahout-powered.svg | 630 -
.../mahout-mr/src/images/logos/favicon.ico | Bin 28838 -> 0 bytes
.../mahout-mr/src/images/logos/favicon128.png | Bin 5259 -> 0 bytes
.../mahout-mr/src/images/logos/favicon16.png | Bin 1009 -> 0 bytes
.../mahout-mr/src/images/logos/favicon32.png | Bin 1847 -> 0 bytes
.../mahout-mr/src/images/logos/favicon64.png | Bin 3148 -> 0 bytes
.../src/images/logos/mahout-logo-100.png | Bin 19477 -> 0 bytes
.../src/images/logos/mahout-logo-200.png | Bin 46360 -> 0 bytes
.../src/images/logos/mahout-logo-300.png | Bin 70139 -> 0 bytes
.../src/images/logos/mahout-logo-400.png | Bin 55468 -> 0 bytes
.../images/logos/mahout-logo-poweredby-100.png | Bin 24623 -> 0 bytes
.../images/logos/mahout-logo-poweredby-55.png | Bin 11684 -> 0 bytes
.../logos/mahout-logo-transparent-400.png | Bin 61970 -> 0 bytes
.../mahout-mr/src/images/logos/mahout-logo.svg | 627 -
community/mahout-mr/src/main/assembly/job.xml | 61 -
community/mahout-mr/src/main/assembly/src.xml | 64 -
.../main/java/org/apache/mahout/Version.java | 41 -
.../cf/taste/common/NoSuchItemException.java | 32 -
.../cf/taste/common/NoSuchUserException.java | 32 -
.../mahout/cf/taste/common/Refreshable.java | 53 -
.../mahout/cf/taste/common/TasteException.java | 41 -
.../mahout/cf/taste/common/Weighting.java | 31 -
.../mahout/cf/taste/eval/DataModelBuilder.java | 45 -
.../mahout/cf/taste/eval/IRStatistics.java | 80 -
.../cf/taste/eval/RecommenderBuilder.java | 45 -
.../cf/taste/eval/RecommenderEvaluator.java | 105 -
.../taste/eval/RecommenderIRStatsEvaluator.java | 64 -
.../taste/eval/RelevantItemsDataSplitter.java | 62 -
.../cf/taste/hadoop/EntityEntityWritable.java | 98 -
.../cf/taste/hadoop/EntityPrefWritable.java | 89 -
.../cf/taste/hadoop/MutableRecommendedItem.java | 81 -
.../taste/hadoop/RecommendedItemsWritable.java | 96 -
.../cf/taste/hadoop/TasteHadoopUtils.java | 84 -
.../cf/taste/hadoop/ToEntityPrefsMapper.java | 78 -
.../cf/taste/hadoop/ToItemPrefsMapper.java | 46 -
.../mahout/cf/taste/hadoop/TopItemsQueue.java | 60 -
.../apache/mahout/cf/taste/hadoop/als/ALS.java | 100 -
.../cf/taste/hadoop/als/DatasetSplitter.java | 158 -
.../hadoop/als/FactorizationEvaluator.java | 166 -
.../hadoop/als/MultithreadedSharingMapper.java | 62 -
.../hadoop/als/ParallelALSFactorizationJob.java | 414 -
.../cf/taste/hadoop/als/PredictionMapper.java | 145 -
.../cf/taste/hadoop/als/RecommenderJob.java | 110 -
.../cf/taste/hadoop/als/SharingMapper.java | 59 -
.../hadoop/als/SolveExplicitFeedbackMapper.java | 61 -
.../hadoop/als/SolveImplicitFeedbackMapper.java | 58 -
.../item/AggregateAndRecommendReducer.java | 220 -
.../mahout/cf/taste/hadoop/item/IDReader.java | 244 -
.../item/ItemFilterAsVectorAndPrefsReducer.java | 62 -
.../cf/taste/hadoop/item/ItemFilterMapper.java | 47 -
.../cf/taste/hadoop/item/ItemIDIndexMapper.java | 56 -
.../taste/hadoop/item/ItemIDIndexReducer.java | 48 -
.../hadoop/item/PartialMultiplyMapper.java | 57 -
.../item/PrefAndSimilarityColumnWritable.java | 85 -
.../cf/taste/hadoop/item/RecommenderJob.java | 337 -
.../item/SimilarityMatrixRowWrapperMapper.java | 54 -
.../taste/hadoop/item/ToUserVectorsReducer.java | 84 -
.../hadoop/item/ToVectorAndPrefReducer.java | 63 -
.../hadoop/item/UserVectorSplitterMapper.java | 116 -
.../hadoop/item/VectorAndPrefsWritable.java | 92 -
.../taste/hadoop/item/VectorOrPrefWritable.java | 104 -
.../preparation/PreparePreferenceMatrixJob.java | 115 -
.../hadoop/preparation/ToItemVectorsMapper.java | 56 -
.../preparation/ToItemVectorsReducer.java | 38 -
.../similarity/item/ItemSimilarityJob.java | 233 -
.../similarity/item/TopSimilarItemsQueue.java | 60 -
.../common/AbstractLongPrimitiveIterator.java | 27 -
.../mahout/cf/taste/impl/common/BitSet.java | 93 -
.../mahout/cf/taste/impl/common/Cache.java | 178 -
.../cf/taste/impl/common/FastByIDMap.java | 661 -
.../mahout/cf/taste/impl/common/FastIDSet.java | 426 -
.../mahout/cf/taste/impl/common/FastMap.java | 729 -
.../taste/impl/common/FixedRunningAverage.java | 83 -
.../common/FixedRunningAverageAndStdDev.java | 51 -
.../taste/impl/common/FullRunningAverage.java | 109 -
.../common/FullRunningAverageAndStdDev.java | 107 -
.../impl/common/InvertedRunningAverage.java | 58 -
.../common/InvertedRunningAverageAndStdDev.java | 63 -
.../impl/common/LongPrimitiveArrayIterator.java | 93 -
.../impl/common/LongPrimitiveIterator.java | 39 -
.../cf/taste/impl/common/RefreshHelper.java | 122 -
.../mahout/cf/taste/impl/common/Retriever.java | 36 -
.../cf/taste/impl/common/RunningAverage.java | 67 -
.../impl/common/RunningAverageAndStdDev.java | 36 -
.../common/SamplingLongPrimitiveIterator.java | 111 -
.../cf/taste/impl/common/SkippingIterator.java | 35 -
.../impl/common/WeightedRunningAverage.java | 100 -
.../common/WeightedRunningAverageAndStdDev.java | 89 -
.../impl/common/jdbc/AbstractJDBCComponent.java | 88 -
.../taste/impl/common/jdbc/EachRowIterator.java | 92 -
.../impl/common/jdbc/ResultSetIterator.java | 66 -
.../AbstractDifferenceRecommenderEvaluator.java | 276 -
...eAbsoluteDifferenceRecommenderEvaluator.java | 59 -
.../GenericRecommenderIRStatsEvaluator.java | 237 -
.../eval/GenericRelevantItemsDataSplitter.java | 83 -
.../cf/taste/impl/eval/IRStatisticsImpl.java | 95 -
.../mahout/cf/taste/impl/eval/LoadCallable.java | 40 -
.../cf/taste/impl/eval/LoadEvaluator.java | 61 -
.../cf/taste/impl/eval/LoadStatistics.java | 34 -
.../eval/OrderBasedRecommenderEvaluator.java | 431 -
.../impl/eval/RMSRecommenderEvaluator.java | 56 -
.../cf/taste/impl/eval/StatsCallable.java | 64 -
.../cf/taste/impl/model/AbstractDataModel.java | 53 -
.../cf/taste/impl/model/AbstractIDMigrator.java | 66 -
.../impl/model/AbstractJDBCIDMigrator.java | 108 -
.../impl/model/BooleanItemPreferenceArray.java | 234 -
.../cf/taste/impl/model/BooleanPreference.java | 64 -
.../impl/model/BooleanUserPreferenceArray.java | 234 -
.../impl/model/GenericBooleanPrefDataModel.java | 320 -
.../cf/taste/impl/model/GenericDataModel.java | 361 -
.../impl/model/GenericItemPreferenceArray.java | 301 -
.../cf/taste/impl/model/GenericPreference.java | 70 -
.../impl/model/GenericUserPreferenceArray.java | 307 -
.../cf/taste/impl/model/MemoryIDMigrator.java | 55 -
.../taste/impl/model/MySQLJDBCIDMigrator.java | 67 -
.../PlusAnonymousConcurrentUserDataModel.java | 352 -
.../impl/model/PlusAnonymousUserDataModel.java | 320 -
.../PlusAnonymousUserLongPrimitiveIterator.java | 90 -
.../cf/taste/impl/model/file/FileDataModel.java | 758 -
.../taste/impl/model/file/FileIDMigrator.java | 117 -
.../neighborhood/AbstractUserNeighborhood.java | 71 -
.../neighborhood/CachingUserNeighborhood.java | 69 -
.../neighborhood/NearestNUserNeighborhood.java | 122 -
.../neighborhood/ThresholdUserNeighborhood.java | 104 -
.../AbstractCandidateItemsStrategy.java | 57 -
.../impl/recommender/AbstractRecommender.java | 140 -
.../AllSimilarItemsCandidateItemsStrategy.java | 50 -
.../AllUnknownItemsCandidateItemsStrategy.java | 41 -
.../impl/recommender/ByRescoreComparator.java | 65 -
.../ByValueRecommendedItemComparator.java | 43 -
.../impl/recommender/CachingRecommender.java | 251 -
.../recommender/EstimatedPreferenceCapper.java | 46 -
.../GenericBooleanPrefItemBasedRecommender.java | 71 -
.../GenericBooleanPrefUserBasedRecommender.java | 82 -
.../GenericItemBasedRecommender.java | 378 -
.../recommender/GenericRecommendedItem.java | 76 -
.../GenericUserBasedRecommender.java | 247 -
.../recommender/ItemAverageRecommender.java | 199 -
.../recommender/ItemUserAverageRecommender.java | 240 -
.../cf/taste/impl/recommender/NullRescorer.java | 86 -
...ItemsNeighborhoodCandidateItemsStrategy.java | 48 -
.../impl/recommender/RandomRecommender.java | 97 -
.../SamplingCandidateItemsStrategy.java | 165 -
.../cf/taste/impl/recommender/SimilarUser.java | 80 -
.../cf/taste/impl/recommender/TopItems.java | 211 -
.../impl/recommender/svd/ALSWRFactorizer.java | 312 -
.../recommender/svd/AbstractFactorizer.java | 94 -
.../impl/recommender/svd/Factorization.java | 137 -
.../taste/impl/recommender/svd/Factorizer.java | 30 -
.../svd/FilePersistenceStrategy.java | 139 -
.../recommender/svd/NoPersistenceStrategy.java | 37 -
.../recommender/svd/ParallelSGDFactorizer.java | 340 -
.../recommender/svd/PersistenceStrategy.java | 46 -
.../recommender/svd/RatingSGDFactorizer.java | 221 -
.../recommender/svd/SVDPlusPlusFactorizer.java | 178 -
.../impl/recommender/svd/SVDPreference.java | 41 -
.../impl/recommender/svd/SVDRecommender.java | 185 -
.../impl/similarity/AbstractItemSimilarity.java | 64 -
.../impl/similarity/AbstractSimilarity.java | 343 -
.../similarity/AveragingPreferenceInferrer.java | 85 -
.../impl/similarity/CachingItemSimilarity.java | 111 -
.../impl/similarity/CachingUserSimilarity.java | 104 -
.../impl/similarity/CityBlockSimilarity.java | 98 -
.../similarity/EuclideanDistanceSimilarity.java | 67 -
.../impl/similarity/GenericItemSimilarity.java | 358 -
.../impl/similarity/GenericUserSimilarity.java | 238 -
.../similarity/LogLikelihoodSimilarity.java | 121 -
.../impl/similarity/LongPairMatchPredicate.java | 40 -
.../PearsonCorrelationSimilarity.java | 93 -
.../SpearmanCorrelationSimilarity.java | 135 -
.../TanimotoCoefficientSimilarity.java | 126 -
.../similarity/UncenteredCosineSimilarity.java | 69 -
.../file/FileItemItemSimilarityIterable.java | 46 -
.../file/FileItemItemSimilarityIterator.java | 60 -
.../similarity/file/FileItemSimilarity.java | 137 -
.../precompute/FileSimilarItemsWriter.java | 67 -
.../MultithreadedBatchItemSimilarities.java | 230 -
.../apache/mahout/cf/taste/model/DataModel.java | 199 -
.../mahout/cf/taste/model/IDMigrator.java | 63 -
.../mahout/cf/taste/model/JDBCDataModel.java | 43 -
.../mahout/cf/taste/model/Preference.java | 48 -
.../mahout/cf/taste/model/PreferenceArray.java | 143 -
.../cf/taste/model/UpdatableIDMigrator.java | 47 -
.../cf/taste/neighborhood/UserNeighborhood.java | 40 -
.../recommender/CandidateItemsStrategy.java | 37 -
.../mahout/cf/taste/recommender/IDRescorer.java | 47 -
.../taste/recommender/ItemBasedRecommender.java | 145 -
.../MostSimilarItemsCandidateItemsStrategy.java | 31 -
.../cf/taste/recommender/RecommendedItem.java | 41 -
.../cf/taste/recommender/Recommender.java | 132 -
.../mahout/cf/taste/recommender/Rescorer.java | 52 -
.../taste/recommender/UserBasedRecommender.java | 54 -
.../cf/taste/similarity/ItemSimilarity.java | 64 -
.../cf/taste/similarity/PreferenceInferrer.java | 47 -
.../cf/taste/similarity/UserSimilarity.java | 58 -
.../precompute/BatchItemSimilarities.java | 56 -
.../similarity/precompute/SimilarItem.java | 56 -
.../similarity/precompute/SimilarItems.java | 84 -
.../precompute/SimilarItemsWriter.java | 33 -
.../classifier/AbstractVectorClassifier.java | 248 -
.../mahout/classifier/ClassifierResult.java | 74 -
.../mahout/classifier/ConfusionMatrix.java | 444 -
.../apache/mahout/classifier/OnlineLearner.java | 96 -
.../classifier/RegressionResultAnalyzer.java | 144 -
.../mahout/classifier/ResultAnalyzer.java | 132 -
.../apache/mahout/classifier/df/Bagging.java | 61 -
.../apache/mahout/classifier/df/DFUtils.java | 174 -
.../mahout/classifier/df/DecisionForest.java | 241 -
.../mahout/classifier/df/ErrorEstimate.java | 51 -
.../df/builder/DecisionTreeBuilder.java | 422 -
.../df/builder/DefaultTreeBuilder.java | 253 -
.../classifier/df/builder/TreeBuilder.java | 42 -
.../apache/mahout/classifier/df/data/Data.java | 281 -
.../classifier/df/data/DataConverter.java | 72 -
.../mahout/classifier/df/data/DataLoader.java | 255 -
.../mahout/classifier/df/data/DataUtils.java | 89 -
.../mahout/classifier/df/data/Dataset.java | 422 -
.../classifier/df/data/DescriptorException.java | 28 -
.../classifier/df/data/DescriptorUtils.java | 110 -
.../mahout/classifier/df/data/Instance.java | 75 -
.../df/data/conditions/Condition.java | 57 -
.../classifier/df/data/conditions/Equals.java | 42 -
.../df/data/conditions/GreaterOrEquals.java | 42 -
.../classifier/df/data/conditions/Lesser.java | 42 -
.../mahout/classifier/df/mapreduce/Builder.java | 333 -
.../classifier/df/mapreduce/Classifier.java | 238 -
.../classifier/df/mapreduce/MapredMapper.java | 75 -
.../classifier/df/mapreduce/MapredOutput.java | 120 -
.../df/mapreduce/inmem/InMemBuilder.java | 114 -
.../df/mapreduce/inmem/InMemInputFormat.java | 284 -
.../df/mapreduce/inmem/InMemMapper.java | 106 -
.../df/mapreduce/inmem/package-info.java | 22 -
.../df/mapreduce/partial/PartialBuilder.java | 158 -
.../df/mapreduce/partial/Step1Mapper.java | 168 -
.../classifier/df/mapreduce/partial/TreeID.java | 58 -
.../df/mapreduce/partial/package-info.java | 16 -
.../classifier/df/node/CategoricalNode.java | 134 -
.../apache/mahout/classifier/df/node/Leaf.java | 95 -
.../apache/mahout/classifier/df/node/Node.java | 96 -
.../classifier/df/node/NumericalNode.java | 115 -
.../classifier/df/ref/SequentialBuilder.java | 78 -
.../classifier/df/split/DefaultIgSplit.java | 118 -
.../mahout/classifier/df/split/IgSplit.java | 35 -
.../mahout/classifier/df/split/OptIgSplit.java | 232 -
.../classifier/df/split/RegressionSplit.java | 177 -
.../mahout/classifier/df/split/Split.java | 68 -
.../mahout/classifier/df/tools/Describe.java | 166 -
.../classifier/df/tools/ForestVisualizer.java | 158 -
.../mahout/classifier/df/tools/Frequencies.java | 122 -
.../classifier/df/tools/FrequenciesJob.java | 297 -
.../classifier/df/tools/TreeVisualizer.java | 264 -
.../mahout/classifier/df/tools/UDistrib.java | 212 -
.../mahout/classifier/evaluation/Auc.java | 233 -
.../AbstractNaiveBayesClassifier.java | 82 -
.../classifier/naivebayes/BayesUtils.java | 161 -
.../ComplementaryNaiveBayesClassifier.java | 43 -
.../classifier/naivebayes/NaiveBayesModel.java | 170 -
.../StandardNaiveBayesClassifier.java | 40 -
.../naivebayes/test/BayesTestMapper.java | 76 -
.../naivebayes/test/TestNaiveBayesDriver.java | 176 -
.../training/ComplementaryThetaTrainer.java | 83 -
.../training/IndexInstancesMapper.java | 53 -
.../naivebayes/training/ThetaMapper.java | 61 -
.../naivebayes/training/TrainNaiveBayesJob.java | 177 -
.../naivebayes/training/WeightsMapper.java | 68 -
.../sequencelearning/hmm/BaumWelchTrainer.java | 161 -
.../sequencelearning/hmm/HmmAlgorithms.java | 306 -
.../sequencelearning/hmm/HmmEvaluator.java | 194 -
.../sequencelearning/hmm/HmmModel.java | 383 -
.../sequencelearning/hmm/HmmTrainer.java | 488 -
.../sequencelearning/hmm/HmmUtils.java | 360 -
.../hmm/LossyHmmSerializer.java | 62 -
.../hmm/RandomSequenceGenerator.java | 102 -
.../sequencelearning/hmm/ViterbiEvaluator.java | 122 -
.../sgd/AbstractOnlineLogisticRegression.java | 317 -
.../sgd/AdaptiveLogisticRegression.java | 586 -
.../mahout/classifier/sgd/CrossFoldLearner.java | 334 -
.../mahout/classifier/sgd/CsvRecordFactory.java | 395 -
.../mahout/classifier/sgd/DefaultGradient.java | 49 -
.../mahout/classifier/sgd/ElasticBandPrior.java | 76 -
.../apache/mahout/classifier/sgd/Gradient.java | 30 -
.../mahout/classifier/sgd/GradientMachine.java | 405 -
.../org/apache/mahout/classifier/sgd/L1.java | 59 -
.../org/apache/mahout/classifier/sgd/L2.java | 66 -
.../mahout/classifier/sgd/MixedGradient.java | 66 -
.../mahout/classifier/sgd/ModelDissector.java | 232 -
.../mahout/classifier/sgd/ModelSerializer.java | 67 -
.../sgd/OnlineLogisticRegression.java | 172 -
.../classifier/sgd/PassiveAggressive.java | 204 -
.../classifier/sgd/PolymorphicWritable.java | 46 -
.../mahout/classifier/sgd/PriorFunction.java | 45 -
.../mahout/classifier/sgd/RankingGradient.java | 85 -
.../mahout/classifier/sgd/RecordFactory.java | 47 -
.../apache/mahout/classifier/sgd/TPrior.java | 61 -
.../mahout/classifier/sgd/UniformPrior.java | 47 -
.../mahout/classifier/sgd/package-info.java | 23 -
.../mahout/clustering/AbstractCluster.java | 390 -
.../org/apache/mahout/clustering/Cluster.java | 90 -
.../mahout/clustering/ClusteringUtils.java | 306 -
.../mahout/clustering/GaussianAccumulator.java | 62 -
.../org/apache/mahout/clustering/Model.java | 93 -
.../mahout/clustering/ModelDistribution.java | 41 -
.../clustering/OnlineGaussianAccumulator.java | 107 -
.../RunningSumsGaussianAccumulator.java | 90 -
.../clustering/UncommonDistributions.java | 136 -
.../apache/mahout/clustering/canopy/Canopy.java | 60 -
.../clustering/canopy/CanopyClusterer.java | 220 -
.../clustering/canopy/CanopyConfigKeys.java | 70 -
.../mahout/clustering/canopy/CanopyDriver.java | 379 -
.../mahout/clustering/canopy/CanopyMapper.java | 66 -
.../mahout/clustering/canopy/CanopyReducer.java | 70 -
.../ClusterClassificationConfigKeys.java | 33 -
.../classify/ClusterClassificationDriver.java | 313 -
.../classify/ClusterClassificationMapper.java | 161 -
.../clustering/classify/ClusterClassifier.java | 231 -
.../WeightedPropertyVectorWritable.java | 95 -
.../classify/WeightedVectorWritable.java | 72 -
.../fuzzykmeans/FuzzyKMeansClusterer.java | 59 -
.../fuzzykmeans/FuzzyKMeansDriver.java | 324 -
.../clustering/fuzzykmeans/FuzzyKMeansUtil.java | 76 -
.../clustering/fuzzykmeans/SoftCluster.java | 60 -
.../iterator/AbstractClusteringPolicy.java | 72 -
.../mahout/clustering/iterator/CIMapper.java | 71 -
.../mahout/clustering/iterator/CIReducer.java | 64 -
.../iterator/CanopyClusteringPolicy.java | 52 -
.../clustering/iterator/ClusterIterator.java | 219 -
.../clustering/iterator/ClusterWritable.java | 56 -
.../clustering/iterator/ClusteringPolicy.java | 66 -
.../iterator/ClusteringPolicyWritable.java | 55 -
.../iterator/DistanceMeasureCluster.java | 91 -
.../iterator/FuzzyKMeansClusteringPolicy.java | 90 -
.../iterator/KMeansClusteringPolicy.java | 64 -
.../clustering/kernel/IKernelProfile.java | 27 -
.../kernel/TriangularKernelProfile.java | 27 -
.../mahout/clustering/kmeans/KMeansDriver.java | 257 -
.../mahout/clustering/kmeans/KMeansUtil.java | 74 -
.../mahout/clustering/kmeans/Kluster.java | 117 -
.../clustering/kmeans/RandomSeedGenerator.java | 136 -
.../mahout/clustering/kmeans/package-info.java | 5 -
.../lda/cvb/CVB0DocInferenceMapper.java | 51 -
.../mahout/clustering/lda/cvb/CVB0Driver.java | 536 -
.../CVB0TopicTermVectorNormalizerMapper.java | 38 -
.../clustering/lda/cvb/CachingCVB0Mapper.java | 133 -
.../lda/cvb/CachingCVB0PerplexityMapper.java | 108 -
.../cvb/InMemoryCollapsedVariationalBayes0.java | 492 -
.../mahout/clustering/lda/cvb/ModelTrainer.java | 301 -
.../mahout/clustering/lda/cvb/TopicModel.java | 513 -
.../apache/mahout/clustering/package-info.java | 13 -
.../spectral/AffinityMatrixInputJob.java | 84 -
.../spectral/AffinityMatrixInputMapper.java | 78 -
.../spectral/AffinityMatrixInputReducer.java | 59 -
.../spectral/IntDoublePairWritable.java | 75 -
.../apache/mahout/clustering/spectral/Keys.java | 31 -
.../spectral/MatrixDiagonalizeJob.java | 108 -
.../clustering/spectral/UnitVectorizerJob.java | 79 -
.../mahout/clustering/spectral/VectorCache.java | 116 -
.../spectral/VectorMatrixMultiplicationJob.java | 139 -
.../clustering/spectral/VertexWritable.java | 101 -
.../spectral/kmeans/EigenSeedGenerator.java | 120 -
.../spectral/kmeans/SpectralKMeansDriver.java | 243 -
.../streaming/cluster/BallKMeans.java | 456 -
.../streaming/cluster/StreamingKMeans.java | 368 -
.../streaming/mapreduce/CentroidWritable.java | 88 -
.../mapreduce/StreamingKMeansDriver.java | 493 -
.../mapreduce/StreamingKMeansMapper.java | 102 -
.../mapreduce/StreamingKMeansReducer.java | 109 -
.../mapreduce/StreamingKMeansThread.java | 92 -
.../mapreduce/StreamingKMeansUtilsMR.java | 154 -
.../streaming/tools/ResplitSequenceFiles.java | 149 -
.../clustering/topdown/PathDirectory.java | 94 -
.../postprocessor/ClusterCountReader.java | 103 -
.../ClusterOutputPostProcessor.java | 139 -
.../ClusterOutputPostProcessorDriver.java | 182 -
.../ClusterOutputPostProcessorMapper.java | 58 -
.../ClusterOutputPostProcessorReducer.java | 62 -
.../org/apache/mahout/common/AbstractJob.java | 648 -
.../org/apache/mahout/common/ClassUtils.java | 61 -
.../apache/mahout/common/CommandLineUtil.java | 68 -
.../org/apache/mahout/common/HadoopUtil.java | 435 -
.../apache/mahout/common/IntPairWritable.java | 270 -
.../org/apache/mahout/common/IntegerTuple.java | 176 -
.../java/org/apache/mahout/common/LongPair.java | 80 -
.../org/apache/mahout/common/MemoryUtil.java | 99 -
.../java/org/apache/mahout/common/Pair.java | 99 -
.../org/apache/mahout/common/Parameters.java | 98 -
.../org/apache/mahout/common/StringTuple.java | 177 -
.../org/apache/mahout/common/StringUtils.java | 63 -
.../apache/mahout/common/TimingStatistics.java | 154 -
.../commandline/DefaultOptionCreator.java | 417 -
.../distance/ChebyshevDistanceMeasure.java | 63 -
.../common/distance/CosineDistanceMeasure.java | 119 -
.../mahout/common/distance/DistanceMeasure.java | 48 -
.../distance/EuclideanDistanceMeasure.java | 41 -
.../distance/MahalanobisDistanceMeasure.java | 197 -
.../distance/ManhattanDistanceMeasure.java | 70 -
.../distance/MinkowskiDistanceMeasure.java | 93 -
.../SquaredEuclideanDistanceMeasure.java | 59 -
.../distance/TanimotoDistanceMeasure.java | 69 -
.../distance/WeightedDistanceMeasure.java | 93 -
.../WeightedEuclideanDistanceMeasure.java | 51 -
.../WeightedManhattanDistanceMeasure.java | 53 -
.../iterator/CopyConstructorIterator.java | 64 -
.../common/iterator/CountingIterator.java | 43 -
.../common/iterator/FileLineIterable.java | 88 -
.../common/iterator/FileLineIterator.java | 167 -
.../iterator/FixedSizeSamplingIterator.java | 59 -
.../common/iterator/SamplingIterable.java | 45 -
.../common/iterator/SamplingIterator.java | 73 -
.../StableFixedSizeSamplingIterator.java | 72 -
.../common/iterator/StringRecordIterator.java | 55 -
.../iterator/sequencefile/PathFilters.java | 81 -
.../common/iterator/sequencefile/PathType.java | 27 -
.../sequencefile/SequenceFileDirIterable.java | 84 -
.../sequencefile/SequenceFileDirIterator.java | 136 -
.../SequenceFileDirValueIterable.java | 83 -
.../SequenceFileDirValueIterator.java | 159 -
.../sequencefile/SequenceFileIterable.java | 68 -
.../sequencefile/SequenceFileIterator.java | 118 -
.../sequencefile/SequenceFileValueIterable.java | 67 -
.../sequencefile/SequenceFileValueIterator.java | 97 -
.../mahout/common/lucene/AnalyzerUtils.java | 61 -
.../common/lucene/IteratorTokenStream.java | 45 -
.../common/lucene/TokenStreamIterator.java | 57 -
.../common/mapreduce/MergeVectorsCombiner.java | 34 -
.../common/mapreduce/MergeVectorsReducer.java | 40 -
.../common/mapreduce/TransposeMapper.java | 49 -
.../common/mapreduce/VectorSumCombiner.java | 38 -
.../common/mapreduce/VectorSumReducer.java | 35 -
.../org/apache/mahout/common/nlp/NGrams.java | 94 -
.../common/parameters/AbstractParameter.java | 120 -
.../common/parameters/ClassParameter.java | 44 -
.../common/parameters/DoubleParameter.java | 33 -
.../mahout/common/parameters/Parameter.java | 62 -
.../mahout/common/parameters/Parametered.java | 206 -
.../mahout/common/parameters/PathParameter.java | 33 -
.../org/apache/mahout/driver/MahoutDriver.java | 244 -
.../apache/mahout/ep/EvolutionaryProcess.java | 229 -
.../main/java/org/apache/mahout/ep/Mapping.java | 206 -
.../main/java/org/apache/mahout/ep/Payload.java | 36 -
.../main/java/org/apache/mahout/ep/State.java | 302 -
.../java/org/apache/mahout/ep/package-info.java | 26 -
.../mahout/math/DistributedRowMatrixWriter.java | 47 -
.../org/apache/mahout/math/MatrixUtils.java | 114 -
.../mahout/math/MultiLabelVectorWritable.java | 88 -
.../math/als/AlternatingLeastSquaresSolver.java | 116 -
...itFeedbackAlternatingLeastSquaresSolver.java | 171 -
.../math/decomposer/AsyncEigenVerifier.java | 80 -
.../mahout/math/decomposer/EigenStatus.java | 50 -
.../math/decomposer/SimpleEigenVerifier.java | 41 -
.../math/decomposer/SingularVectorVerifier.java | 25 -
.../math/decomposer/hebbian/EigenUpdater.java | 25 -
.../math/decomposer/hebbian/HebbianSolver.java | 342 -
.../math/decomposer/hebbian/HebbianUpdater.java | 71 -
.../math/decomposer/hebbian/TrainingState.java | 143 -
.../math/decomposer/lanczos/LanczosSolver.java | 213 -
.../math/decomposer/lanczos/LanczosState.java | 107 -
.../math/hadoop/DistributedRowMatrix.java | 390 -
.../math/hadoop/MatrixColumnMeansJob.java | 236 -
.../math/hadoop/MatrixMultiplicationJob.java | 177 -
.../mahout/math/hadoop/TimesSquaredJob.java | 251 -
.../apache/mahout/math/hadoop/TransposeJob.java | 85 -
.../decomposer/DistributedLanczosSolver.java | 299 -
.../math/hadoop/decomposer/EigenVector.java | 76 -
.../hadoop/decomposer/EigenVerificationJob.java | 333 -
.../decomposer/HdfsBackedLanczosState.java | 237 -
.../math/hadoop/similarity/SeedVectorUtil.java | 104 -
.../VectorDistanceInvertedMapper.java | 71 -
.../hadoop/similarity/VectorDistanceMapper.java | 80 -
.../similarity/VectorDistanceSimilarityJob.java | 153 -
.../similarity/cooccurrence/MutableElement.java | 50 -
.../cooccurrence/RowSimilarityJob.java | 562 -
.../cooccurrence/TopElementsQueue.java | 59 -
.../hadoop/similarity/cooccurrence/Vectors.java | 199 -
.../measures/CityBlockSimilarity.java | 26 -
.../measures/CooccurrenceCountSimilarity.java | 32 -
.../cooccurrence/measures/CosineSimilarity.java | 50 -
.../measures/CountbasedMeasure.java | 44 -
.../measures/EuclideanDistanceSimilarity.java | 57 -
.../measures/LoglikelihoodSimilarity.java | 34 -
.../measures/PearsonCorrelationSimilarity.java | 37 -
.../measures/TanimotoCoefficientSimilarity.java | 34 -
.../measures/VectorSimilarityMeasure.java | 32 -
.../measures/VectorSimilarityMeasures.java | 46 -
.../DistributedConjugateGradientSolver.java | 172 -
.../mahout/math/hadoop/stats/BasicStats.java | 148 -
.../StandardDeviationCalculatorMapper.java | 55 -
.../StandardDeviationCalculatorReducer.java | 37 -
.../math/hadoop/stats/VarianceTotals.java | 68 -
.../hadoop/stochasticsvd/ABtDenseOutJob.java | 585 -
.../math/hadoop/stochasticsvd/ABtJob.java | 494 -
.../mahout/math/hadoop/stochasticsvd/BtJob.java | 628 -
.../stochasticsvd/DenseBlockWritable.java | 83 -
.../mahout/math/hadoop/stochasticsvd/Omega.java | 257 -
.../mahout/math/hadoop/stochasticsvd/QJob.java | 237 -
.../math/hadoop/stochasticsvd/SSVDCli.java | 201 -
.../math/hadoop/stochasticsvd/SSVDHelper.java | 322 -
.../math/hadoop/stochasticsvd/SSVDSolver.java | 662 -
.../SparseRowBlockAccumulator.java | 90 -
.../stochasticsvd/SparseRowBlockWritable.java | 159 -
.../stochasticsvd/SplitPartitionedWritable.java | 151 -
.../mahout/math/hadoop/stochasticsvd/UJob.java | 170 -
.../mahout/math/hadoop/stochasticsvd/VJob.java | 224 -
.../math/hadoop/stochasticsvd/YtYJob.java | 220 -
.../stochasticsvd/qr/GivensThinSolver.java | 643 -
.../hadoop/stochasticsvd/qr/GramSchmidt.java | 52 -
.../hadoop/stochasticsvd/qr/QRFirstStep.java | 284 -
.../hadoop/stochasticsvd/qr/QRLastStep.java | 144 -
.../mahout/math/neighborhood/BruteSearch.java | 186 -
.../math/neighborhood/FastProjectionSearch.java | 326 -
.../mahout/math/neighborhood/HashedVector.java | 103 -
.../LocalitySensitiveHashSearch.java | 295 -
.../math/neighborhood/ProjectionSearch.java | 233 -
.../mahout/math/neighborhood/Searcher.java | 155 -
.../math/neighborhood/UpdatableSearcher.java | 37 -
.../math/random/AbstractSamplerFunction.java | 39 -
.../mahout/math/random/ChineseRestaurant.java | 111 -
.../apache/mahout/math/random/Empirical.java | 124 -
.../apache/mahout/math/random/IndianBuffet.java | 157 -
.../org/apache/mahout/math/random/Missing.java | 59 -
.../apache/mahout/math/random/MultiNormal.java | 118 -
.../apache/mahout/math/random/Multinomial.java | 202 -
.../org/apache/mahout/math/random/Normal.java | 40 -
.../mahout/math/random/PoissonSampler.java | 67 -
.../mahout/math/random/RandomProjector.java | 133 -
.../org/apache/mahout/math/random/Sampler.java | 25 -
.../mahout/math/random/WeightedThing.java | 71 -
.../mahout/math/ssvd/SequentialBigSvd.java | 69 -
.../math/ssvd/SequentialOutOfCoreSvd.java | 233 -
.../mahout/math/stats/GlobalOnlineAuc.java | 168 -
.../mahout/math/stats/GroupedOnlineAuc.java | 113 -
.../org/apache/mahout/math/stats/OnlineAuc.java | 38 -
.../mahout/math/stats/OnlineSummarizer.java | 93 -
.../org/apache/mahout/math/stats/Sampler.java | 79 -
.../mahout/vectorizer/DictionaryVectorizer.java | 422 -
.../mahout/vectorizer/DocumentProcessor.java | 99 -
.../EncodedVectorsFromSequenceFiles.java | 104 -
.../mahout/vectorizer/EncodingMapper.java | 92 -
.../mahout/vectorizer/HighDFWordsPruner.java | 147 -
.../SimpleTextEncodingVectorizer.java | 72 -
.../SparseVectorsFromSequenceFiles.java | 369 -
.../java/org/apache/mahout/vectorizer/TF.java | 30 -
.../org/apache/mahout/vectorizer/TFIDF.java | 31 -
.../apache/mahout/vectorizer/Vectorizer.java | 29 -
.../mahout/vectorizer/VectorizerConfig.java | 179 -
.../org/apache/mahout/vectorizer/Weight.java | 32 -
.../collocations/llr/CollocCombiner.java | 46 -
.../collocations/llr/CollocDriver.java | 284 -
.../collocations/llr/CollocMapper.java | 178 -
.../collocations/llr/CollocReducer.java | 176 -
.../vectorizer/collocations/llr/Gram.java | 239 -
.../vectorizer/collocations/llr/GramKey.java | 133 -
.../llr/GramKeyGroupComparator.java | 43 -
.../collocations/llr/GramKeyPartitioner.java | 40 -
.../vectorizer/collocations/llr/LLRReducer.java | 170 -
.../common/PartialVectorMergeReducer.java | 89 -
.../vectorizer/common/PartialVectorMerger.java | 144 -
.../document/SequenceFileTokenizerMapper.java | 70 -
.../encoders/AdaptiveWordValueEncoder.java | 69 -
.../encoders/CachingContinuousValueEncoder.java | 64 -
.../encoders/CachingStaticWordValueEncoder.java | 66 -
.../encoders/CachingTextValueEncoder.java | 25 -
.../encoders/CachingValueEncoder.java | 64 -
.../encoders/ConstantValueEncoder.java | 57 -
.../encoders/ContinuousValueEncoder.java | 76 -
.../mahout/vectorizer/encoders/Dictionary.java | 54 -
.../encoders/FeatureVectorEncoder.java | 279 -
.../encoders/InteractionValueEncoder.java | 126 -
.../encoders/LuceneTextValueEncoder.java | 129 -
.../encoders/StaticWordValueEncoder.java | 80 -
.../vectorizer/encoders/TextValueEncoder.java | 142 -
.../vectorizer/encoders/WordValueEncoder.java | 81 -
.../pruner/PrunedPartialVectorMergeReducer.java | 65 -
.../vectorizer/pruner/WordsPrunerReducer.java | 86 -
.../vectorizer/term/TFPartialVectorReducer.java | 139 -
.../vectorizer/term/TermCountCombiner.java | 41 -
.../mahout/vectorizer/term/TermCountMapper.java | 58 -
.../vectorizer/term/TermCountReducer.java | 55 -
.../term/TermDocumentCountMapper.java | 50 -
.../term/TermDocumentCountReducer.java | 41 -
.../mahout/vectorizer/tfidf/TFIDFConverter.java | 361 -
.../tfidf/TFIDFPartialVectorReducer.java | 114 -
.../src/main/resources/supplemental-models.xml | 279 -
community/mahout-mr/src/main/resources/version | 1 -
.../mahout/cf/taste/common/CommonTest.java | 60 -
.../cf/taste/hadoop/TasteHadoopUtilsTest.java | 40 -
.../cf/taste/hadoop/TopItemsQueueTest.java | 72 -
.../als/ParallelALSFactorizationJobTest.java | 379 -
.../cf/taste/hadoop/item/IDReaderTest.java | 66 -
.../taste/hadoop/item/RecommenderJobTest.java | 928 -
.../hadoop/item/ToUserVectorsReducerTest.java | 74 -
.../similarity/item/ItemSimilarityJobTest.java | 269 -
.../mahout/cf/taste/impl/TasteTestCase.java | 98 -
.../mahout/cf/taste/impl/common/BitSetTest.java | 74 -
.../mahout/cf/taste/impl/common/CacheTest.java | 61 -
.../cf/taste/impl/common/FastByIDMapTest.java | 147 -
.../cf/taste/impl/common/FastIDSetTest.java | 162 -
.../cf/taste/impl/common/FastMapTest.java | 228 -
.../impl/common/InvertedRunningAverageTest.java | 88 -
.../common/LongPrimitiveArrayIteratorTest.java | 56 -
.../cf/taste/impl/common/MockRefreshable.java | 45 -
.../cf/taste/impl/common/RefreshHelperTest.java | 70 -
.../common/RunningAverageAndStdDevTest.java | 107 -
.../taste/impl/common/RunningAverageTest.java | 75 -
.../SamplingLongPrimitiveIteratorTest.java | 91 -
.../impl/common/WeightedRunningAverageTest.java | 85 -
...ericRecommenderIRStatsEvaluatorImplTest.java | 73 -
.../taste/impl/eval/LoadEvaluationRunner.java | 68 -
.../model/BooleanItemPreferenceArrayTest.java | 89 -
.../model/BooleanUserPreferenceArrayTest.java | 89 -
.../taste/impl/model/GenericDataModelTest.java | 51 -
.../model/GenericItemPreferenceArrayTest.java | 110 -
.../model/GenericUserPreferenceArrayTest.java | 110 -
.../taste/impl/model/MemoryIDMigratorTest.java | 57 -
...lusAnonymousConcurrentUserDataModelTest.java | 313 -
.../impl/model/file/FileDataModelTest.java | 216 -
.../impl/model/file/FileIDMigratorTest.java | 103 -
.../impl/neighborhood/DummySimilarity.java | 68 -
.../neighborhood/NearestNNeighborhoodTest.java | 53 -
.../neighborhood/ThresholdNeighborhoodTest.java | 51 -
...lUnknownItemsCandidateItemsStrategyTest.java | 65 -
.../recommender/CachingRecommenderTest.java | 78 -
.../GenericItemBasedRecommenderTest.java | 324 -
.../GenericUserBasedRecommenderTest.java | 174 -
.../recommender/ItemAverageRecommenderTest.java | 43 -
.../ItemUserAverageRecommenderTest.java | 43 -
.../taste/impl/recommender/MockRecommender.java | 89 -
.../impl/recommender/NullRescorerTest.java | 47 -
...sNeighborhoodCandidateItemsStrategyTest.java | 75 -
.../impl/recommender/RandomRecommenderTest.java | 41 -
.../impl/recommender/ReversingRescorer.java | 46 -
.../SamplingCandidateItemsStrategyTest.java | 71 -
.../cf/taste/impl/recommender/TopItemsTest.java | 158 -
.../recommender/svd/ALSWRFactorizerTest.java | 208 -
.../svd/FilePersistenceStrategyTest.java | 53 -
.../svd/ParallelSGDFactorizerTest.java | 355 -
.../recommender/svd/SVDRecommenderTest.java | 86 -
.../AveragingPreferenceInferrerTest.java | 37 -
.../EuclideanDistanceSimilarityTest.java | 236 -
.../similarity/GenericItemSimilarityTest.java | 104 -
.../similarity/LogLikelihoodSimilarityTest.java | 80 -
.../PearsonCorrelationSimilarityTest.java | 265 -
.../impl/similarity/SimilarityTestCase.java | 35 -
.../SpearmanCorrelationSimilarityTest.java | 80 -
.../TanimotoCoefficientSimilarityTest.java | 121 -
.../similarity/file/FileItemSimilarityTest.java | 142 -
.../MultithreadedBatchItemSimilaritiesTest.java | 98 -
.../similarity/precompute/SimilarItemsTest.java | 50 -
.../mahout/classifier/ClassifierData.java | 102 -
.../mahout/classifier/ConfusionMatrixTest.java | 119 -
.../RegressionResultAnalyzerTest.java | 128 -
.../classifier/df/DecisionForestTest.java | 206 -
.../df/builder/DecisionTreeBuilderTest.java | 78 -
.../df/builder/DefaultTreeBuilderTest.java | 74 -
.../df/builder/InfiniteRecursionTest.java | 60 -
.../classifier/df/data/DataConverterTest.java | 60 -
.../classifier/df/data/DataLoaderTest.java | 350 -
.../mahout/classifier/df/data/DataTest.java | 396 -
.../mahout/classifier/df/data/DatasetTest.java | 72 -
.../classifier/df/data/DescriptorUtilsTest.java | 92 -
.../apache/mahout/classifier/df/data/Utils.java | 284 -
.../mapreduce/inmem/InMemInputFormatTest.java | 109 -
.../df/mapreduce/inmem/InMemInputSplitTest.java | 77 -
.../mapreduce/partial/PartialBuilderTest.java | 197 -
.../df/mapreduce/partial/Step1MapperTest.java | 160 -
.../df/mapreduce/partial/TreeIDTest.java | 48 -
.../mahout/classifier/df/node/NodeTest.java | 108 -
.../classifier/df/split/DefaultIgSplitTest.java | 78 -
.../df/split/RegressionSplitTest.java | 87 -
.../classifier/df/tools/VisualizerTest.java | 211 -
.../mahout/classifier/evaluation/AucTest.java | 86 -
.../ComplementaryNaiveBayesClassifierTest.java | 47 -
.../naivebayes/NaiveBayesModelTest.java | 36 -
.../classifier/naivebayes/NaiveBayesTest.java | 135 -
.../naivebayes/NaiveBayesTestBase.java | 135 -
.../StandardNaiveBayesClassifierTest.java | 47 -
.../training/IndexInstancesMapperTest.java | 85 -
.../naivebayes/training/ThetaMapperTest.java | 61 -
.../naivebayes/training/WeightsMapperTest.java | 60 -
.../sequencelearning/hmm/HMMAlgorithmsTest.java | 164 -
.../sequencelearning/hmm/HMMEvaluatorTest.java | 63 -
.../sequencelearning/hmm/HMMModelTest.java | 32 -
.../sequencelearning/hmm/HMMTestBase.java | 73 -
.../sequencelearning/hmm/HMMTrainerTest.java | 163 -
.../sequencelearning/hmm/HMMUtilsTest.java | 161 -
.../sgd/AdaptiveLogisticRegressionTest.java | 186 -
.../classifier/sgd/CsvRecordFactoryTest.java | 90 -
.../classifier/sgd/GradientMachineTest.java | 41 -
.../classifier/sgd/ModelSerializerTest.java | 162 -
.../mahout/classifier/sgd/OnlineBaseTest.java | 160 -
.../sgd/OnlineLogisticRegressionTest.java | 330 -
.../classifier/sgd/PassiveAggressiveTest.java | 35 -
.../mahout/clustering/ClusteringTestUtils.java | 152 -
.../mahout/clustering/TestClusterInterface.java | 83 -
.../clustering/TestGaussianAccumulators.java | 186 -
.../clustering/canopy/TestCanopyCreation.java | 674 -
.../ClusterClassificationDriverTest.java | 255 -
.../fuzzykmeans/TestFuzzyKmeansClustering.java | 202 -
.../iterator/TestClusterClassifier.java | 238 -
.../clustering/kmeans/TestKmeansClustering.java | 385 -
.../kmeans/TestRandomSeedGenerator.java | 169 -
.../clustering/lda/cvb/TestCVBModelTrainer.java | 138 -
.../spectral/TestAffinityMatrixInputJob.java | 145 -
.../spectral/TestMatrixDiagonalizeJob.java | 116 -
.../spectral/TestUnitVectorizerJob.java | 65 -
.../clustering/spectral/TestVectorCache.java | 110 -
.../TestVectorMatrixMultiplicationJob.java | 75 -
.../spectral/kmeans/TestEigenSeedGenerator.java | 100 -
.../streaming/cluster/BallKMeansTest.java | 196 -
.../clustering/streaming/cluster/DataUtils.java | 92 -
.../streaming/cluster/StreamingKMeansTest.java | 169 -
.../mapreduce/StreamingKMeansTestMR.java | 282 -
.../tools/ResplitSequenceFilesTest.java | 80 -
.../clustering/topdown/PathDirectoryTest.java | 65 -
.../postprocessor/ClusterCountReaderTest.java | 121 -
.../ClusterOutputPostProcessorTest.java | 205 -
.../apache/mahout/common/AbstractJobTest.java | 240 -
.../DistributedCacheFileLocationTest.java | 46 -
.../mahout/common/DummyOutputCollector.java | 57 -
.../apache/mahout/common/DummyRecordWriter.java | 223 -
.../mahout/common/DummyRecordWriterTest.java | 45 -
.../mahout/common/DummyStatusReporter.java | 76 -
.../mahout/common/IntPairWritableTest.java | 114 -
.../apache/mahout/common/MahoutTestCase.java | 148 -
.../org/apache/mahout/common/MockIterator.java | 51 -
.../apache/mahout/common/StringUtilsTest.java | 70 -
.../distance/CosineDistanceMeasureTest.java | 66 -
.../distance/DefaultDistanceMeasureTest.java | 103 -
.../DefaultWeightedDistanceMeasureTest.java | 56 -
.../common/distance/TestChebyshevMeasure.java | 55 -
.../distance/TestEuclideanDistanceMeasure.java | 26 -
.../TestMahalanobisDistanceMeasure.java | 56 -
.../distance/TestManhattanDistanceMeasure.java | 26 -
.../common/distance/TestMinkowskiMeasure.java | 64 -
.../distance/TestTanimotoDistanceMeasure.java | 25 -
...estWeightedEuclideanDistanceMeasureTest.java | 25 -
.../TestWeightedManhattanDistanceMeasure.java | 26 -
.../common/iterator/CountingIteratorTest.java | 44 -
.../mahout/common/iterator/SamplerCase.java | 101 -
.../common/iterator/TestFixedSizeSampler.java | 33 -
.../common/iterator/TestSamplingIterator.java | 77 -
.../iterator/TestStableFixedSizeSampler.java | 33 -
.../mahout/common/lucene/AnalyzerUtilsTest.java | 38 -
.../apache/mahout/driver/MahoutDriverTest.java | 32 -
.../mahout/ep/EvolutionaryProcessTest.java | 81 -
.../apache/mahout/math/MatrixWritableTest.java | 148 -
.../java/org/apache/mahout/math/VarintTest.java | 189 -
.../apache/mahout/math/VectorWritableTest.java | 123 -
.../apache/mahout/math/hadoop/MathHelper.java | 236 -
.../math/hadoop/TestDistributedRowMatrix.java | 395 -
.../TestDistributedLanczosSolver.java | 132 -
.../TestDistributedLanczosSolverCLI.java | 190 -
.../TestVectorDistanceSimilarityJob.java | 238 -
.../cooccurrence/RowSimilarityJobTest.java | 214 -
.../measures/VectorSimilarityMeasuresTest.java | 133 -
.../TestDistributedConjugateGradientSolver.java | 59 -
...stDistributedConjugateGradientSolverCLI.java | 111 -
.../math/hadoop/stats/BasicStatsTest.java | 121 -
.../stochasticsvd/LocalSSVDPCASparseTest.java | 296 -
.../stochasticsvd/LocalSSVDSolverDenseTest.java | 206 -
.../LocalSSVDSolverSparseSequentialTest.java | 209 -
.../hadoop/stochasticsvd/SSVDCommonTest.java | 105 -
.../hadoop/stochasticsvd/SSVDTestsHelper.java | 172 -
.../LocalitySensitiveHashSearchTest.java | 119 -
.../mahout/math/neighborhood/LumpyData.java | 77 -
.../math/neighborhood/SearchQualityTest.java | 178 -
.../math/neighborhood/SearchSanityTest.java | 244 -
.../math/ssvd/SequentialOutOfCoreSvdTest.java | 195 -
.../apache/mahout/math/stats/OnlineAucTest.java | 127 -
.../apache/mahout/math/stats/SamplerTest.java | 45 -
.../vectorizer/DictionaryVectorizerTest.java | 220 -
.../vectorizer/DocumentProcessorTest.java | 81 -
.../EncodedVectorsFromSequenceFilesTest.java | 126 -
.../vectorizer/HighDFWordsPrunerTest.java | 154 -
.../vectorizer/RandomDocumentGenerator.java | 69 -
.../SparseVectorsFromSequenceFilesTest.java | 203 -
.../collocations/llr/CollocMapperTest.java | 180 -
.../collocations/llr/CollocReducerTest.java | 86 -
.../llr/GramKeyGroupComparatorTest.java | 45 -
.../llr/GramKeyPartitionerTest.java | 54 -
.../collocations/llr/GramKeyTest.java | 106 -
.../vectorizer/collocations/llr/GramTest.java | 215 -
.../collocations/llr/LLRReducerTest.java | 116 -
.../vectorizer/encoders/CachingEncoderTest.java | 48 -
.../encoders/ConstantValueEncoderTest.java | 74 -
.../encoders/ContinuousValueEncoderTest.java | 88 -
.../encoders/InteractionValueEncoderTest.java | 103 -
.../encoders/TextValueEncoderTest.java | 99 -
.../encoders/WordLikeValueEncoderTest.java | 99 -
.../mahout-mr/src/test/resources/FPGsynth.dat | 193 -
.../mahout-mr/src/test/resources/cancer.csv | 684 -
community/mahout-mr/src/test/resources/iris.csv | 151 -
.../mahout-mr/src/test/resources/retail.dat | 88162 -----------------
.../retail_results_with_min_sup_100.dat | 6438 --
community/mahout-mr/src/test/resources/sgd.csv | 61 -
.../mahout-mr/src/test/resources/word-list.txt | 512 -
engine/hdfs/pom.xml | 26 +-
pom.xml | 22 +-
1838 files changed, 304042 insertions(+), 304003 deletions(-)
----------------------------------------------------------------------
r***@apache.org
2018-06-28 14:55:01 UTC
Permalink
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
new file mode 100644
index 0000000..f4b8bcb
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainLogistic.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.io.Resources;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Train a logistic regression for the examples from Chapter 13 of Mahout in Action
+ */
+public final class TrainLogistic {
+
+ private static String inputFile;
+ private static String outputFile;
+ private static LogisticModelParameters lmp;
+ private static int passes;
+ private static boolean scores;
+ private static OnlineLogisticRegression model;
+
+ private TrainLogistic() {
+ }
+
+ public static void main(String[] args) throws Exception {
+ mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+ }
+
+ static void mainToOutput(String[] args, PrintWriter output) throws Exception {
+ if (parseArgs(args)) {
+ double logPEstimate = 0;
+ int samples = 0;
+
+ CsvRecordFactory csv = lmp.getCsvRecordFactory();
+ OnlineLogisticRegression lr = lmp.createRegression();
+ for (int pass = 0; pass < passes; pass++) {
+ try (BufferedReader in = open(inputFile)) {
+ // read variable names
+ csv.firstLine(in.readLine());
+
+ String line = in.readLine();
+ while (line != null) {
+ // for each new line, get target and predictors
+ Vector input = new RandomAccessSparseVector(lmp.getNumFeatures());
+ int targetValue = csv.processLine(line, input);
+
+ // check performance while this is still news
+ double logP = lr.logLikelihood(targetValue, input);
+ if (!Double.isInfinite(logP)) {
+ if (samples < 20) {
+ logPEstimate = (samples * logPEstimate + logP) / (samples + 1);
+ } else {
+ logPEstimate = 0.95 * logPEstimate + 0.05 * logP;
+ }
+ samples++;
+ }
+ double p = lr.classifyScalar(input);
+ if (scores) {
+ output.printf(Locale.ENGLISH, "%10d %2d %10.2f %2.4f %10.4f %10.4f%n",
+ samples, targetValue, lr.currentLearningRate(), p, logP, logPEstimate);
+ }
+
+ // now update model
+ lr.train(targetValue, input);
+
+ line = in.readLine();
+ }
+ }
+ }
+
+ try (OutputStream modelOutput = new FileOutputStream(outputFile)) {
+ lmp.saveTo(modelOutput);
+ }
+
+ output.println(lmp.getNumFeatures());
+ output.println(lmp.getTargetVariable() + " ~ ");
+ String sep = "";
+ for (String v : csv.getTraceDictionary().keySet()) {
+ double weight = predictorWeight(lr, 0, csv, v);
+ if (weight != 0) {
+ output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v);
+ sep = " + ";
+ }
+ }
+ output.printf("%n");
+ model = lr;
+ for (int row = 0; row < lr.getBeta().numRows(); row++) {
+ for (String key : csv.getTraceDictionary().keySet()) {
+ double weight = predictorWeight(lr, row, csv, key);
+ if (weight != 0) {
+ output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight);
+ }
+ }
+ for (int column = 0; column < lr.getBeta().numCols(); column++) {
+ output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column));
+ }
+ output.println();
+ }
+ }
+ }
+
+ private static double predictorWeight(OnlineLogisticRegression lr, int row, RecordFactory csv, String predictor) {
+ double weight = 0;
+ for (Integer column : csv.getTraceDictionary().get(predictor)) {
+ weight += lr.getBeta().get(row, column);
+ }
+ return weight;
+ }
+
+ private static boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ Option quiet = builder.withLongName("quiet").withDescription("be extra quiet").create();
+ Option scores = builder.withLongName("scores").withDescription("output score diagnostics during training").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFile = builder.withLongName("input")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("where to get training data")
+ .create();
+
+ Option outputFile = builder.withLongName("output")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+ .withDescription("where to get training data")
+ .create();
+
+ Option predictors = builder.withLongName("predictors")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("p").create())
+ .withDescription("a list of predictor variables")
+ .create();
+
+ Option types = builder.withLongName("types")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("t").create())
+ .withDescription("a list of predictor variable types (numeric, word, or text)")
+ .create();
+
+ Option target = builder.withLongName("target")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("target").withMaximum(1).create())
+ .withDescription("the name of the target variable")
+ .create();
+
+ Option features = builder.withLongName("features")
+ .withArgument(
+ argumentBuilder.withName("numFeatures")
+ .withDefault("1000")
+ .withMaximum(1).create())
+ .withDescription("the number of internal hashed features to use")
+ .create();
+
+ Option passes = builder.withLongName("passes")
+ .withArgument(
+ argumentBuilder.withName("passes")
+ .withDefault("2")
+ .withMaximum(1).create())
+ .withDescription("the number of times to pass over the input data")
+ .create();
+
+ Option lambda = builder.withLongName("lambda")
+ .withArgument(argumentBuilder.withName("lambda").withDefault("1e-4").withMaximum(1).create())
+ .withDescription("the amount of coefficient decay to use")
+ .create();
+
+ Option rate = builder.withLongName("rate")
+ .withArgument(argumentBuilder.withName("learningRate").withDefault("1e-3").withMaximum(1).create())
+ .withDescription("the learning rate")
+ .create();
+
+ Option noBias = builder.withLongName("noBias")
+ .withDescription("don't include a bias term")
+ .create();
+
+ Option targetCategories = builder.withLongName("categories")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("number").withMaximum(1).create())
+ .withDescription("the number of target categories to be considered")
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(quiet)
+ .withOption(inputFile)
+ .withOption(outputFile)
+ .withOption(target)
+ .withOption(targetCategories)
+ .withOption(predictors)
+ .withOption(types)
+ .withOption(passes)
+ .withOption(lambda)
+ .withOption(rate)
+ .withOption(noBias)
+ .withOption(features)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+ CommandLine cmdLine = parser.parseAndHelp(args);
+
+ if (cmdLine == null) {
+ return false;
+ }
+
+ TrainLogistic.inputFile = getStringArgument(cmdLine, inputFile);
+ TrainLogistic.outputFile = getStringArgument(cmdLine, outputFile);
+
+ List<String> typeList = new ArrayList<>();
+ for (Object x : cmdLine.getValues(types)) {
+ typeList.add(x.toString());
+ }
+
+ List<String> predictorList = new ArrayList<>();
+ for (Object x : cmdLine.getValues(predictors)) {
+ predictorList.add(x.toString());
+ }
+
+ lmp = new LogisticModelParameters();
+ lmp.setTargetVariable(getStringArgument(cmdLine, target));
+ lmp.setMaxTargetCategories(getIntegerArgument(cmdLine, targetCategories));
+ lmp.setNumFeatures(getIntegerArgument(cmdLine, features));
+ lmp.setUseBias(!getBooleanArgument(cmdLine, noBias));
+ lmp.setTypeMap(predictorList, typeList);
+
+ lmp.setLambda(getDoubleArgument(cmdLine, lambda));
+ lmp.setLearningRate(getDoubleArgument(cmdLine, rate));
+
+ TrainLogistic.scores = getBooleanArgument(cmdLine, scores);
+ TrainLogistic.passes = getIntegerArgument(cmdLine, passes);
+
+ return true;
+ }
+
+ private static String getStringArgument(CommandLine cmdLine, Option inputFile) {
+ return (String) cmdLine.getValue(inputFile);
+ }
+
+ private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+ return cmdLine.hasOption(option);
+ }
+
+ private static int getIntegerArgument(CommandLine cmdLine, Option features) {
+ return Integer.parseInt((String) cmdLine.getValue(features));
+ }
+
+ private static double getDoubleArgument(CommandLine cmdLine, Option op) {
+ return Double.parseDouble((String) cmdLine.getValue(op));
+ }
+
+ public static OnlineLogisticRegression getModel() {
+ return model;
+ }
+
+ public static LogisticModelParameters getParameters() {
+ return lmp;
+ }
+
+ static BufferedReader open(String inputFile) throws IOException {
+ InputStream in;
+ try {
+ in = Resources.getResource(inputFile).openStream();
+ } catch (IllegalArgumentException e) {
+ in = new FileInputStream(new File(inputFile));
+ }
+ return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
new file mode 100644
index 0000000..632b32c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/TrainNewsGroups.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import org.apache.mahout.classifier.NewsgroupHelper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.Dictionary;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Reads and trains an adaptive logistic regression model on the 20 newsgroups data.
+ * The first command line argument gives the path of the directory holding the training
+ * data. The optional second argument, leakType, defines which classes of features to use.
+ * Importantly, leakType controls whether a synthetic date is injected into the data as
+ * a target leak and if so, how.
+ * <p/>
+ * The value of leakType % 3 determines whether the target leak is injected according to
+ * the following table:
+ * <p/>
+ * <table>
+ * <tr><td valign='top'>0</td><td>No leak injected</td></tr>
+ * <tr><td valign='top'>1</td><td>Synthetic date injected in MMM-yyyy format. This will be a single token and
+ * is a perfect target leak since each newsgroup is given a different month</td></tr>
+ * <tr><td valign='top'>2</td><td>Synthetic date injected in dd-MMM-yyyy HH:mm:ss format. The day varies
+ * and thus there are more leak symbols that need to be learned. Ultimately this is just
+ * as big a leak as case 1.</td></tr>
+ * </table>
+ * <p/>
+ * Leaktype also determines what other text will be indexed. If leakType is greater
+ * than or equal to 6, then neither headers nor text body will be used for features and the leak is the only
+ * source of data. If leakType is greater than or equal to 3, then subject words will be used as features.
+ * If leakType is less than 3, then both subject and body text will be used as features.
+ * <p/>
+ * A leakType of 0 gives no leak and all textual features.
+ * <p/>
+ * See the following table for a summary of commonly used values for leakType
+ * <p/>
+ * <table>
+ * <tr><td><b>leakType</b></td><td><b>Leak?</b></td><td><b>Subject?</b></td><td><b>Body?</b></td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>0</td><td>no</td><td>yes</td><td>yes</td></tr>
+ * <tr><td>1</td><td>mmm-yyyy</td><td>yes</td><td>yes</td></tr>
+ * <tr><td>2</td><td>dd-mmm-yyyy</td><td>yes</td><td>yes</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>3</td><td>no</td><td>yes</td><td>no</td></tr>
+ * <tr><td>4</td><td>mmm-yyyy</td><td>yes</td><td>no</td></tr>
+ * <tr><td>5</td><td>dd-mmm-yyyy</td><td>yes</td><td>no</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * <tr><td>6</td><td>no</td><td>no</td><td>no</td></tr>
+ * <tr><td>7</td><td>mmm-yyyy</td><td>no</td><td>no</td></tr>
+ * <tr><td>8</td><td>dd-mmm-yyyy</td><td>no</td><td>no</td></tr>
+ * <tr><td colspan=4><hr></td></tr>
+ * </table>
+ */
+public final class TrainNewsGroups {
+
+ private TrainNewsGroups() {
+ }
+
+ public static void main(String[] args) throws IOException {
+ File base = new File(args[0]);
+
+ Multiset<String> overallCounts = HashMultiset.create();
+
+ int leakType = 0;
+ if (args.length > 1) {
+ leakType = Integer.parseInt(args[1]);
+ }
+
+ Dictionary newsGroups = new Dictionary();
+
+ NewsgroupHelper helper = new NewsgroupHelper();
+ helper.getEncoder().setProbes(2);
+ AdaptiveLogisticRegression learningAlgorithm =
+ new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1());
+ learningAlgorithm.setInterval(800);
+ learningAlgorithm.setAveragingWindow(500);
+
+ List<File> files = new ArrayList<>();
+ for (File newsgroup : base.listFiles()) {
+ if (newsgroup.isDirectory()) {
+ newsGroups.intern(newsgroup.getName());
+ files.addAll(Arrays.asList(newsgroup.listFiles()));
+ }
+ }
+ Collections.shuffle(files);
+ System.out.println(files.size() + " training files");
+ SGDInfo info = new SGDInfo();
+
+ int k = 0;
+
+ for (File file : files) {
+ String ng = file.getParentFile().getName();
+ int actual = newsGroups.intern(ng);
+
+ Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts);
+ learningAlgorithm.train(actual, v);
+
+ k++;
+ State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
+
+ SGDHelper.analyzeState(info, leakType, k, best);
+ }
+ learningAlgorithm.close();
+ SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts);
+ System.out.println("exiting main");
+
+ File modelFile = new File(System.getProperty("java.io.tmpdir"), "news-group.model");
+ ModelSerializer.writeBinary(modelFile.getAbsolutePath(),
+ learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));
+
+ List<Integer> counts = new ArrayList<>();
+ System.out.println("Word counts");
+ for (String count : overallCounts.elementSet()) {
+ counts.add(overallCounts.count(count));
+ }
+ Collections.sort(counts, Ordering.natural().reverse());
+ k = 0;
+ for (Integer count : counts) {
+ System.out.println(k + "\t" + count);
+ k++;
+ if (k > 1000) {
+ break;
+ }
+ }
+ }
+
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
new file mode 100644
index 0000000..7a74289
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/ValidateAdaptiveLogistic.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.Locale;
+
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.commons.io.Charsets;
+import org.apache.mahout.classifier.ConfusionMatrix;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.classifier.sgd.AdaptiveLogisticRegression.Wrapper;
+import org.apache.mahout.ep.State;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+/*
+ * AUC and averageLikelihood are always shown when possible. If the number of target values is
+ * greater than 2, then the AUC and the entropy matrix are not shown regardless of the showAuc and
+ * showEntropy values the user passes, because the current implementation only supports them for
+ * two-value (binary) targets.
+ * */
+public final class ValidateAdaptiveLogistic {
+
+  // Command-line state, populated by parseArgs() before mainToOutput() runs.
+  private static String inputFile;
+  private static String modelFile;
+  private static String defaultCategory;
+  private static boolean showAuc;
+  private static boolean showScores;
+  private static boolean showConfusion;
+
+  private ValidateAdaptiveLogistic() {
+  }
+
+  public static void main(String[] args) throws IOException {
+    mainToOutput(args, new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true));
+  }
+
+  /**
+   * Validates a previously trained adaptive logistic model against a CSV data set
+   * and prints AUC, per-record scores and/or a confusion matrix as requested.
+   *
+   * @param args   command-line arguments (see {@link #parseArgs(String[])})
+   * @param output destination for the validation report
+   * @throws IOException if the input or model file cannot be read
+   */
+  static void mainToOutput(String[] args, PrintWriter output) throws IOException {
+    if (parseArgs(args)) {
+      if (!showAuc && !showConfusion && !showScores) {
+        // No explicit output option given: default to AUC and confusion matrix.
+        showAuc = true;
+        showConfusion = true;
+      }
+
+      Auc collector = null;
+      AdaptiveLogisticModelParameters lmp = AdaptiveLogisticModelParameters
+          .loadFromFile(new File(modelFile));
+      CsvRecordFactory csv = lmp.getCsvRecordFactory();
+      AdaptiveLogisticRegression lr = lmp.createAdaptiveLogisticRegression();
+
+      // AUC (and the entropy matrix below) are only defined for binary targets.
+      if (lmp.getTargetCategories().size() <= 2) {
+        collector = new Auc();
+      }
+
+      OnlineSummarizer slh = new OnlineSummarizer();
+      ConfusionMatrix cm = new ConfusionMatrix(lmp.getTargetCategories(), defaultCategory);
+
+      State<Wrapper, CrossFoldLearner> best = lr.getBest();
+      if (best == null) {
+        // Fixed message (previously read "has not be trained probably").
+        output.println("AdaptiveLogisticRegression has not been trained properly.");
+        return;
+      }
+      CrossFoldLearner learner = best.getPayload().getLearner();
+
+      // try-with-resources: the reader was previously never closed.
+      try (BufferedReader in = TrainLogistic.open(inputFile)) {
+        String line = in.readLine();
+        csv.firstLine(line);
+        line = in.readLine();
+        if (showScores) {
+          output.println("\"target\", \"model-output\", \"log-likelihood\", \"average-likelihood\"");
+        }
+        while (line != null) {
+          Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures());
+          //TODO: How to avoid extra target values not shown in the training process.
+          int target = csv.processLine(line, v);
+          double likelihood = learner.logLikelihood(target, v);
+          double score = learner.classifyFull(v).maxValue();
+
+          slh.add(likelihood);
+          cm.addInstance(csv.getTargetString(line), csv.getTargetLabel(target));
+
+          if (showScores) {
+            output.printf(Locale.ENGLISH, "%8d, %.12f, %.13f, %.13f%n", target,
+                score, learner.logLikelihood(target, v), slh.getMean());
+          }
+          if (collector != null) {
+            collector.add(target, score);
+          }
+          line = in.readLine();
+        }
+      }
+
+      // Trailing %n added so the statistics line does not run into the heading.
+      output.printf(Locale.ENGLISH, "%nLog-likelihood:%n");
+      output.printf(Locale.ENGLISH, "Min=%.2f, Max=%.2f, Mean=%.2f, Median=%.2f%n",
+          slh.getMin(), slh.getMax(), slh.getMean(), slh.getMedian());
+
+      if (collector != null) {
+        output.printf(Locale.ENGLISH, "%nAUC = %.2f%n", collector.auc());
+      }
+
+      if (showConfusion) {
+        output.printf(Locale.ENGLISH, "%n%s%n%n", cm.toString());
+
+        if (collector != null) {
+          Matrix m = collector.entropy();
+          output.printf(Locale.ENGLISH,
+              "Entropy Matrix: [[%.1f, %.1f], [%.1f, %.1f]]%n", m.get(0, 0),
+              m.get(1, 0), m.get(0, 1), m.get(1, 1));
+        }
+      }
+
+    }
+  }
+
+  /**
+   * Parses the command line into the static fields above.
+   *
+   * @return true if parsing succeeded and validation should proceed
+   */
+  private static boolean parseArgs(String[] args) {
+    DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+    Option help = builder.withLongName("help")
+        .withDescription("print this list").create();
+
+    Option quiet = builder.withLongName("quiet")
+        .withDescription("be extra quiet").create();
+
+    Option auc = builder.withLongName("auc").withDescription("print AUC")
+        .create();
+    Option confusion = builder.withLongName("confusion")
+        .withDescription("print confusion matrix").create();
+
+    Option scores = builder.withLongName("scores")
+        .withDescription("print scores").create();
+
+    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+    Option inputFileOption = builder
+        .withLongName("input")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("input").withMaximum(1)
+                .create())
+        .withDescription("where to get validate data").create();
+
+    Option modelFileOption = builder
+        .withLongName("model")
+        .withRequired(true)
+        .withArgument(
+            argumentBuilder.withName("model").withMaximum(1)
+                .create())
+        .withDescription("where to get the trained model").create();
+
+    // Local renamed from the misspelled "defaultCagetoryOption".
+    Option defaultCategoryOption = builder
+        .withLongName("defaultCategory")
+        .withRequired(false)
+        .withArgument(
+            argumentBuilder.withName("defaultCategory").withMaximum(1).withDefault("unknown")
+                .create())
+        .withDescription("the default category value to use").create();
+
+    Group normalArgs = new GroupBuilder().withOption(help)
+        .withOption(quiet).withOption(auc).withOption(scores)
+        .withOption(confusion).withOption(inputFileOption)
+        .withOption(modelFileOption).withOption(defaultCategoryOption).create();
+
+    Parser parser = new Parser();
+    parser.setHelpOption(help);
+    parser.setHelpTrigger("--help");
+    parser.setGroup(normalArgs);
+    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
+    CommandLine cmdLine = parser.parseAndHelp(args);
+
+    if (cmdLine == null) {
+      return false;
+    }
+
+    inputFile = getStringArgument(cmdLine, inputFileOption);
+    modelFile = getStringArgument(cmdLine, modelFileOption);
+    defaultCategory = getStringArgument(cmdLine, defaultCategoryOption);
+    showAuc = getBooleanArgument(cmdLine, auc);
+    showScores = getBooleanArgument(cmdLine, scores);
+    showConfusion = getBooleanArgument(cmdLine, confusion);
+
+    return true;
+  }
+
+  /** @return true if the flag-style option was present on the command line */
+  private static boolean getBooleanArgument(CommandLine cmdLine, Option option) {
+    return cmdLine.hasOption(option);
+  }
+
+  /** @return the single string value supplied for the given option (param renamed to avoid shadowing the {@code inputFile} field) */
+  private static String getStringArgument(CommandLine cmdLine, Option option) {
+    return (String) cmdLine.getValue(option);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
new file mode 100644
index 0000000..ab3c861
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/BankMarketingClassificationMain.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import com.google.common.collect.Lists;
+import org.apache.mahout.classifier.evaluation.Auc;
+import org.apache.mahout.classifier.sgd.L1;
+import org.apache.mahout.classifier.sgd.OnlineLogisticRegression;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Uses the SGD classifier on the 'Bank marketing' dataset from UCI.
+ *
+ * See http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
+ *
+ * Learn when people accept or reject an offer from the bank via telephone based on income, age, education and more.
+ */
+public class BankMarketingClassificationMain {
+
+  // Binary target: the customer either accepted or rejected the offer.
+  public static final int NUM_CATEGORIES = 2;
+
+  public static void main(String[] args) throws Exception {
+    // Load the whole data set from the classpath resource into memory.
+    List<TelephoneCall> calls = Lists.newArrayList(new TelephoneCallParser("bank-full.csv"));
+
+    // Fraction of the data held out for evaluation on each run.
+    double heldOutPercentage = 0.10;
+
+    // 20 independent runs, each with a fresh random train/test split.
+    for (int run = 0; run < 20; run++) {
+      Collections.shuffle(calls);
+      int cutoff = (int) (heldOutPercentage * calls.size());
+      // subList returns views of 'calls'; the backing list is not modified below.
+      List<TelephoneCall> test = calls.subList(0, cutoff);
+      List<TelephoneCall> train = calls.subList(cutoff, calls.size());
+
+      // L1-regularized online logistic regression with a decaying learning rate.
+      OnlineLogisticRegression lr = new OnlineLogisticRegression(NUM_CATEGORIES, TelephoneCall.FEATURES, new L1())
+          .learningRate(1)
+          .alpha(1)
+          .lambda(0.000001)
+          .stepOffset(10000)
+          .decayExponent(0.2);
+      for (int pass = 0; pass < 20; pass++) {
+        for (TelephoneCall observation : train) {
+          lr.train(observation.getTarget(), observation.asVector());
+        }
+        // Report held-out AUC every fifth pass.
+        if (pass % 5 == 0) {
+          Auc eval = new Auc(0.5);
+          for (TelephoneCall testCall : test) {
+            eval.add(testCall.getTarget(), lr.classifyScalar(testCall.asVector()));
+          }
+          System.out.printf("%d, %.4f, %.4f\n", pass, lr.currentLearningRate(), eval.auc());
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
new file mode 100644
index 0000000..728ec20
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCall.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
+
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/** One row of the UCI bank-marketing data set, vectorized for SGD training. */
+public class TelephoneCall {
+  // Dimensionality of the hashed feature vector.
+  public static final int FEATURES = 100;
+  private static final ConstantValueEncoder interceptEncoder = new ConstantValueEncoder("intercept");
+  private static final FeatureVectorEncoder featureEncoder = new StaticWordValueEncoder("feature");
+
+  // Encoded features for this record.
+  private RandomAccessSparseVector vector;
+
+  // Raw field name -> raw field value, in CSV column order.
+  private Map<String, String> fields = new LinkedHashMap<>();
+
+  /**
+   * Builds the feature vector by pairing each field name with the next value.
+   * Assumes 'values' yields at least as many items as 'fieldNames' — TODO confirm.
+   */
+  public TelephoneCall(Iterable<String> fieldNames, Iterable<String> values) {
+    vector = new RandomAccessSparseVector(FEATURES);
+    Iterator<String> value = values.iterator();
+    interceptEncoder.addToVector("1", vector);
+    for (String name : fieldNames) {
+      String fieldValue = value.next();
+      fields.put(name, fieldValue);
+
+      switch (name) {
+        case "age": {
+          // Log-transform to compress the age range.
+          double v = Double.parseDouble(fieldValue);
+          featureEncoder.addToVector(name, Math.log(v), vector);
+          break;
+        }
+        case "balance": {
+          // Clamp at -2000, then shift so the log argument stays positive;
+          // the -8 recenters the result near zero.
+          double v;
+          v = Double.parseDouble(fieldValue);
+          if (v < -2000) {
+            v = -2000;
+          }
+          featureEncoder.addToVector(name, Math.log(v + 2001) - 8, vector);
+          break;
+        }
+        case "duration": {
+          // +1 guards against log(0); -5 recenters.
+          double v;
+          v = Double.parseDouble(fieldValue);
+          featureEncoder.addToVector(name, Math.log(v + 1) - 5, vector);
+          break;
+        }
+        case "pdays": {
+          // pdays is -1 when the client was never contacted; +2 keeps log positive.
+          double v;
+          v = Double.parseDouble(fieldValue);
+          featureEncoder.addToVector(name, Math.log(v + 2), vector);
+          break;
+        }
+        case "job":
+        case "marital":
+        case "education":
+        case "default":
+        case "housing":
+        case "loan":
+        case "contact":
+        case "campaign":
+        case "previous":
+        case "poutcome":
+          // Categorical fields: one hashed indicator feature per name:value pair.
+          featureEncoder.addToVector(name + ":" + fieldValue, 1, vector);
+          break;
+        case "day":
+        case "month":
+        case "y":
+          // ignore these for vectorizing
+          break;
+        default:
+          throw new IllegalArgumentException(String.format("Bad field name: %s", name));
+      }
+    }
+  }
+
+  /** @return the encoded feature vector (shared, not copied) */
+  public Vector asVector() {
+    return vector;
+  }
+
+  /** @return 0 if the customer said "no", 1 otherwise */
+  public int getTarget() {
+    return fields.get("y").equals("no") ? 0 : 1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
new file mode 100644
index 0000000..5ef6490
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/classifier/sgd/bankmarketing/TelephoneCallParser.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.classifier.sgd.bankmarketing;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Splitter;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.io.Resources;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.Iterator;
+
+/** Parses semi-colon separated data as TelephoneCalls */
+public class TelephoneCallParser implements Iterable<TelephoneCall> {
+
+  // Fields are semicolon-separated and may be quoted; trim quotes and spaces.
+  private final Splitter onSemi = Splitter.on(";").trimResults(CharMatcher.anyOf("\" ;"));
+  // Classpath resource name of the CSV file.
+  private String resourceName;
+
+  public TelephoneCallParser(String resourceName) throws IOException {
+    this.resourceName = resourceName;
+  }
+
+  /**
+   * Opens the resource and streams one TelephoneCall per data line.
+   * The first line is consumed as the header of field names.
+   * NOTE(review): the reader is never closed explicitly; acceptable for a
+   * one-shot example, but consider closing it when endOfData() is reached.
+   */
+  @Override
+  public Iterator<TelephoneCall> iterator() {
+    try {
+      return new AbstractIterator<TelephoneCall>() {
+        BufferedReader input =
+            new BufferedReader(new InputStreamReader(Resources.getResource(resourceName).openStream()));
+        // Header row: field names used to label every subsequent record.
+        Iterable<String> fieldNames = onSemi.split(input.readLine());
+
+        @Override
+        protected TelephoneCall computeNext() {
+          try {
+            String line = input.readLine();
+            if (line == null) {
+              return endOfData();
+            }
+
+            return new TelephoneCall(fieldNames, onSemi.split(line));
+          } catch (IOException e) {
+            throw new RuntimeException("Error reading data", e);
+          }
+        }
+      };
+    } catch (IOException e) {
+      // Thrown by openStream()/readLine() in the initializers above.
+      throw new RuntimeException("Error reading data", e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
new file mode 100644
index 0000000..a0b845f
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+final class ClustersFilter implements PathFilter {
+
+  /**
+   * Accepts only paths produced by clustering iterations, i.e. any path whose
+   * string form contains a {@code /clusters-} segment.
+   */
+  @Override
+  public boolean accept(Path path) {
+    return path.toString().contains("/clusters-");
+  }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
new file mode 100644
index 0000000..50dba99
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayCanopy.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.canopy.CanopyDriver;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.DenseVector;
+
+/**
+ * Java desktop graphics class that runs canopy clustering and displays the results.
+ * This class generates random data and clusters it.
+ */
+@Deprecated
+public class DisplayCanopy extends DisplayClustering {
+
+  DisplayCanopy() {
+    initialize();
+    this.setTitle("Canopy Clusters (>" + (int) (significance * 100) + "% of population)");
+  }
+
+  @Override
+  public void paint(Graphics g) {
+    plotSampleData((Graphics2D) g);
+    plotClusters((Graphics2D) g);
+  }
+
+  /**
+   * Draws each significant canopy: its T1 and T2 distance thresholds as blue
+   * ellipses, plus a colored ellipse at 3x the cluster radius. Older cluster
+   * generations get higher color indices; the newest (cx == 0) is drawn thickest.
+   */
+  protected static void plotClusters(Graphics2D g2) {
+    int cx = CLUSTERS.size() - 1;
+    for (List<Cluster> clusters : CLUSTERS) {
+      for (Cluster cluster : clusters) {
+        if (isSignificant(cluster)) {
+          g2.setStroke(new BasicStroke(1));
+          g2.setColor(Color.BLUE);
+          double[] t1 = {T1, T1};
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t1));
+          double[] t2 = {T2, T2};
+          plotEllipse(g2, cluster.getCenter(), new DenseVector(t2));
+          g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx)]);
+          g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
+          plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+        }
+      }
+      cx--;
+    }
+  }
+
+  /**
+   * Generates random sample data, runs canopy clustering on it with the
+   * Manhattan distance measure, then opens the display frame.
+   */
+  public static void main(String[] args) throws Exception {
+    Path samples = new Path("samples");
+    Path output = new Path("output");
+    Configuration conf = new Configuration();
+    // Start from a clean slate so stale results are not displayed.
+    HadoopUtil.delete(conf, samples);
+    HadoopUtil.delete(conf, output);
+    RandomUtils.useTestSeed();
+    generateSamples();
+    writeSampleData(samples);
+    CanopyDriver.buildClusters(conf, samples, output, new ManhattanDistanceMeasure(), T1, T2, 0, true);
+    loadClustersWritable(output);
+
+    new DisplayCanopy();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
new file mode 100644
index 0000000..ad85c6a
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.*;
+import java.awt.event.WindowAdapter;
+import java.awt.event.WindowEvent;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Ellipse2D;
+import java.awt.geom.Rectangle2D;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.clustering.AbstractCluster;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.UncommonDistributions;
+import org.apache.mahout.clustering.classify.WeightedVectorWritable;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DisplayClustering extends Frame {
+
+  private static final Logger log = LoggerFactory.getLogger(DisplayClustering.class);
+
+  protected static final int DS = 72; // default scale = 72 pixels per inch
+
+  protected static final int SIZE = 8; // screen size in inches
+
+  // Per-group generation parameters: (meanX, meanY, sdX, sdY).
+  private static final Collection<Vector> SAMPLE_PARAMS = new ArrayList<>();
+
+  // All generated 2-d sample points.
+  protected static final List<VectorWritable> SAMPLE_DATA = new ArrayList<>();
+
+  // One entry per clustering iteration; each entry is that iteration's clusters.
+  protected static final List<List<Cluster>> CLUSTERS = new ArrayList<>();
+
+  // Palette indexed by cluster generation (clamped at the last color).
+  static final Color[] COLORS = { Color.red, Color.orange, Color.yellow, Color.green, Color.blue, Color.magenta,
+    Color.lightGray };
+
+  protected static final double T1 = 3.0; // outer canopy threshold
+
+  protected static final double T2 = 2.8; // inner canopy threshold
+
+  // Minimum fraction of all points a cluster must hold to be "significant".
+  static double significance = 0.05;
+
+  protected static int res; // screen resolution
+
+  /** Creates and shows the frame with the default "Sample Data" title. */
+  public DisplayClustering() {
+    initialize();
+    this.setTitle("Sample Data");
+  }
+
+  /** Sizes the frame to SIZE inches at the native screen resolution and installs a close handler. */
+  public void initialize() {
+    // Get screen resolution
+    res = Toolkit.getDefaultToolkit().getScreenResolution();
+
+    // Set Frame size in inches
+    this.setSize(SIZE * res, SIZE * res);
+    this.setVisible(true);
+    this.setTitle("Asymmetric Sample Data");
+
+    // Window listener to terminate program.
+    this.addWindowListener(new WindowAdapter() {
+      @Override
+      public void windowClosing(WindowEvent e) {
+        System.exit(0);
+      }
+    });
+  }
+
+  /** Generates seeded random sample data and opens the display frame. */
+  public static void main(String[] args) throws Exception {
+    RandomUtils.useTestSeed();
+    generateSamples();
+    new DisplayClustering();
+  }
+
+  // Override paint() to draw samples, generation parameters, and cluster ellipses.
+  @Override
+  public void paint(Graphics g) {
+    Graphics2D g2 = (Graphics2D) g;
+    plotSampleData(g2);
+    plotSampleParameters(g2);
+    plotClusters(g2);
+  }
+
+  /**
+   * Draws every recorded cluster generation as ellipses at 3x cluster radius.
+   * The newest generation (index 0) uses the thickest stroke; older ones get
+   * higher color indices, clamped to the last palette entry.
+   */
+  protected static void plotClusters(Graphics2D g2) {
+    int generation = CLUSTERS.size() - 1;
+    for (List<Cluster> snapshot : CLUSTERS) {
+      g2.setStroke(new BasicStroke(generation == 0 ? 3 : 1));
+      g2.setColor(COLORS[Math.min(COLORS.length - 1, generation)]);
+      generation--;
+      for (Cluster cluster : snapshot) {
+        plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
+      }
+    }
+  }
+
+  /** Draws a red ellipse for each generator: center (mx, my), axes 3x the standard deviations. */
+  protected static void plotSampleParameters(Graphics2D g2) {
+    Vector v = new DenseVector(2);
+    Vector dv = new DenseVector(2);
+    g2.setColor(Color.RED);
+    for (Vector param : SAMPLE_PARAMS) {
+      // param layout: (meanX, meanY, sdX, sdY) — see generateSamples().
+      v.set(0, param.get(0));
+      v.set(1, param.get(1));
+      dv.set(0, param.get(2) * 3);
+      dv.set(1, param.get(3) * 3);
+      plotEllipse(g2, v, dv);
+    }
+  }
+
+  /** Scales the canvas to data coordinates, draws the axes, then plots every sample point. */
+  protected static void plotSampleData(Graphics2D g2) {
+    // Scale factor from data units (DS px/inch) to the actual screen resolution.
+    double sx = (double) res / DS;
+    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
+
+    // plot the axes
+    g2.setColor(Color.BLACK);
+    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
+    plotRectangle(g2, new DenseVector(2).assign(2), dv);
+    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
+
+    // plot the sample data
+    g2.setColor(Color.DARK_GRAY);
+    dv.assign(0.03);
+    for (VectorWritable v : SAMPLE_DATA) {
+      plotRectangle(g2, v.get(), dv);
+    }
+  }
+
+  /**
+   * This method plots points and colors them according to their cluster
+   * membership, rather than drawing ellipses.
+   *
+   * As of commit, this method is used only by K-means spectral clustering.
+   * Since the cluster assignments are set within the eigenspace of the data, it
+   * is not inherent that the original data cluster as they would in K-means:
+   * that is, as symmetric gaussian mixtures.
+   *
+   * Since Spectral K-Means uses K-Means to cluster the eigenspace data, the raw
+   * output is not directly usable. Rather, the cluster assignments from the raw
+   * output need to be transferred back to the original data. As such, this
+   * method will read the SequenceFile cluster results of K-means and transfer
+   * the cluster assignments to the original data, coloring them appropriately.
+   *
+   * @param g2 graphics context to draw into
+   * @param data clustering output directory containing "clusteredPoints"
+   */
+  protected static void plotClusteredSampleData(Graphics2D g2, Path data) {
+    double sx = (double) res / DS;
+    g2.setTransform(AffineTransform.getScaleInstance(sx, sx));
+
+    g2.setColor(Color.BLACK);
+    Vector dv = new DenseVector(2).assign(SIZE / 2.0);
+    plotRectangle(g2, new DenseVector(2).assign(2), dv);
+    plotRectangle(g2, new DenseVector(2).assign(-2), dv);
+
+    // plot the sample data, colored according to the cluster they belong to
+    dv.assign(0.03);
+
+    // Assumes records appear in the same order as SAMPLE_DATA — TODO confirm.
+    Path clusteredPointsPath = new Path(data, "clusteredPoints");
+    Path inputPath = new Path(clusteredPointsPath, "part-m-00000");
+    Map<Integer,Color> colors = new HashMap<>();
+    int point = 0;
+    for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(
+        inputPath, new Configuration())) {
+      int clusterId = record.getFirst().get();
+      VectorWritable v = SAMPLE_DATA.get(point++);
+      Integer key = clusterId;
+      // Assign each newly seen cluster the next palette color (clamped at the end).
+      if (!colors.containsKey(key)) {
+        colors.put(key, COLORS[Math.min(COLORS.length - 1, colors.size())]);
+      }
+      plotClusteredRectangle(g2, v.get(), dv, colors.get(key));
+    }
+  }
+
+  /**
+   * Draws a rectangle whose outline uses the supplied color; the geometry is
+   * identical to {@link #plotRectangle(Graphics2D, Vector, Vector)}.
+   *
+   * @param g2 graphics context to draw into
+   * @param v center of the rectangle in data coordinates
+   * @param dv width/height of the rectangle in data coordinates
+   * @param color stroke color for the outline
+   */
+  protected static void plotClusteredRectangle(Graphics2D g2, Vector v, Vector dv, Color color) {
+    // Mirror the y axis (screen y grows downward), then move from the center
+    // to the upper-left corner.
+    Vector corner = v.times(new DenseVector(new double[] {1, -1})).minus(dv.divide(2));
+    // Shift into the positive quadrant before scaling to pixels.
+    double offset = SIZE / 2;
+    double x = corner.get(0) + offset;
+    double y = corner.get(1) + offset;
+
+    g2.setStroke(new BasicStroke(1));
+    g2.setColor(color);
+    g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
+  }
+
+ /**
+ * Draw a rectangle on the graphics context
+ *
+ * @param g2
+ * a Graphics2D context
+ * @param v
+ * a Vector of rectangle center
+ * @param dv
+ * a Vector of rectangle dimensions
+ */
+ protected static void plotRectangle(Graphics2D g2, Vector v, Vector dv) {
+ // mirror the y-axis: data coordinates grow upward, screen y grows downward
+ double[] flip = {1, -1};
+ Vector v2 = v.times(new DenseVector(flip));
+ // shift from the center to the rectangle's top-left corner
+ v2 = v2.minus(dv.divide(2));
+ // translate the origin from the display center to the top-left corner
+ int h = SIZE / 2;
+ double x = v2.get(0) + h;
+ double y = v2.get(1) + h;
+ g2.draw(new Rectangle2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
+ }
+
+ /**
+ * Draw an ellipse on the graphics context
+ *
+ * @param g2
+ * a Graphics2D context
+ * @param v
+ * a Vector of ellipse center
+ * @param dv
+ * a Vector of ellipse dimensions
+ */
+ protected static void plotEllipse(Graphics2D g2, Vector v, Vector dv) {
+ // mirror the y-axis: data coordinates grow upward, screen y grows downward
+ double[] flip = {1, -1};
+ Vector v2 = v.times(new DenseVector(flip));
+ // shift from the center to the ellipse's bounding-box top-left corner
+ v2 = v2.minus(dv.divide(2));
+ // translate the origin from the display center to the top-left corner
+ int h = SIZE / 2;
+ double x = v2.get(0) + h;
+ double y = v2.get(1) + h;
+ g2.draw(new Ellipse2D.Double(x * DS, y * DS, dv.get(0) * DS, dv.get(1) * DS));
+ }
+
+ /**
+ * Generate the standard test data set: 1100 samples (500 + 300 + 300) drawn
+ * from three symmetric normal distributions of differing mean and spread.
+ */
+ protected static void generateSamples() {
+ generateSamples(500, 1, 1, 3);
+ generateSamples(300, 1, 0, 0.5);
+ generateSamples(300, 0, 2, 0.1);
+ }
+
+ /**
+ * Generate an asymmetric variant of the test data set: 1100 samples from
+ * three normal distributions, the last having different x and y standard
+ * deviations.
+ */
+ protected static void generate2dSamples() {
+ generate2dSamples(500, 1, 1, 3, 1);
+ generate2dSamples(300, 1, 0, 0.5, 1);
+ generate2dSamples(300, 0, 2, 0.1, 0.5);
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ */
+ protected static void generateSamples(int num, double mx, double my, double sd) {
+ // remember the generating parameters so the true model can be
+ // superimposed on the display later
+ double[] params = {mx, my, sd, sd};
+ SAMPLE_PARAMS.add(new DenseVector(params));
+ log.info("Generating {} samples m=[{}, {}] sd={}", num, mx, my, sd);
+ // each sample is an independent 2-d normal draw around (mx, my)
+ for (int i = 0; i < num; i++) {
+ SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd)})));
+ }
+ }
+
+ /**
+ * Write every generated sample to a single sequence file of
+ * (Text, VectorWritable) pairs keyed "sample_0", "sample_1", ...
+ *
+ * @param output
+ * path of the sequence file to create
+ * @throws IOException
+ * if the file cannot be written
+ */
+ protected static void writeSampleData(Path output) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+
+ // try-with-resources closes (and flushes) the writer even on failure
+ try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class, VectorWritable.class)) {
+ int i = 0;
+ for (VectorWritable vw : SAMPLE_DATA) {
+ writer.append(new Text("sample_" + i++), vw);
+ }
+ }
+ }
+
+ /**
+ * Read all clusters from the sequence files under the given path (skipping
+ * log and CRC files), logging each cluster as it is read.
+ *
+ * @param clustersIn
+ * directory of ClusterWritable sequence files for one iteration
+ * @return the clusters in iteration order
+ */
+ protected static List<Cluster> readClustersWritable(Path clustersIn) {
+ List<Cluster> clusters = new ArrayList<>();
+ Configuration conf = new Configuration();
+ for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
+ PathFilters.logsCRCFilter(), conf)) {
+ Cluster cluster = value.getValue();
+ log.info(
+ "Reading Cluster:{} center:{} numPoints:{} radius:{}",
+ cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
+ cluster.getNumObservations(), AbstractCluster.formatVector(cluster.getRadius(), null));
+ clusters.add(cluster);
+ }
+ return clusters;
+ }
+
+ /**
+ * Load the clusters produced by each iteration (directories accepted by
+ * ClustersFilter) under the output path into CLUSTERS, one list of
+ * clusters per iteration.
+ *
+ * @param output
+ * the clustering job's output directory
+ * @throws IOException
+ * if the filesystem cannot be listed or read
+ */
+ protected static void loadClustersWritable(Path output) throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
+ List<Cluster> clusters = readClustersWritable(s.getPath());
+ CLUSTERS.add(clusters);
+ }
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sdx
+ * double x-value standard deviation of the samples
+ * @param sdy
+ * double y-value standard deviation of the samples
+ */
+ protected static void generate2dSamples(int num, double mx, double my, double sdx, double sdy) {
+ // remember the generating parameters so the true model can be
+ // superimposed on the display later
+ double[] params = {mx, my, sdx, sdy};
+ SAMPLE_PARAMS.add(new DenseVector(params));
+ log.info("Generating {} samples m=[{}, {}] sd=[{}, {}]", num, mx, my, sdx, sdy);
+ // each sample is an independent 2-d normal draw with per-axis deviations
+ for (int i = 0; i < num; i++) {
+ SAMPLE_DATA.add(new VectorWritable(new DenseVector(new double[] {UncommonDistributions.rNorm(mx, sdx),
+ UncommonDistributions.rNorm(my, sdy)})));
+ }
+ }
+
+ /**
+ * A cluster is significant when it holds more than the {@code significance}
+ * fraction of all generated samples; insignificant clusters are not drawn.
+ *
+ * @param cluster
+ * the cluster to test
+ * @return true if the cluster's population fraction exceeds significance
+ */
+ protected static boolean isSignificant(Cluster cluster) {
+ return (double) cluster.getNumObservations() / SAMPLE_DATA.size() > significance;
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
new file mode 100644
index 0000000..f8ce7c7
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayFuzzyKMeans.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
+import org.apache.mahout.clustering.fuzzykmeans.SoftCluster;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.FuzzyKMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Swing example that runs sequential Fuzzy k-Means over the generated sample
+ * data and displays the points with the clusters superimposed. Run main()
+ * with no arguments; intermediate data is written under the local "samples"
+ * and "output" directories, which are deleted first.
+ */
+public class DisplayFuzzyKMeans extends DisplayClustering {
+
+ DisplayFuzzyKMeans() {
+ initialize();
+ this.setTitle("Fuzzy k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+ }
+
+ // Override the paint() method
+ @Override
+ public void paint(Graphics g) {
+ plotSampleData((Graphics2D) g);
+ plotClusters((Graphics2D) g);
+ }
+
+ public static void main(String[] args) throws Exception {
+ DistanceMeasure measure = new ManhattanDistanceMeasure();
+
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ Configuration conf = new Configuration();
+ HadoopUtil.delete(conf, output);
+ HadoopUtil.delete(conf, samples);
+ // fixed seed so successive runs produce the same sample set and display
+ RandomUtils.useTestSeed();
+ DisplayClustering.generateSamples();
+ writeSampleData(samples);
+ // edit runClusterer to false to exercise the classifier-based path instead
+ boolean runClusterer = true;
+ int maxIterations = 10;
+ float threshold = 0.001F;
+ float m = 1.1F;
+ if (runClusterer) {
+ runSequentialFuzzyKClusterer(conf, samples, output, measure, maxIterations, m, threshold);
+ } else {
+ int numClusters = 3;
+ runSequentialFuzzyKClassifier(conf, samples, output, measure, numClusters, maxIterations, m, threshold);
+ }
+ new DisplayFuzzyKMeans();
+ }
+
+ // Seed a ClusterClassifier with the first numClusters samples, then iterate
+ // the fuzzy k-means policy sequentially and load the resulting clusters.
+ private static void runSequentialFuzzyKClassifier(Configuration conf, Path samples, Path output,
+ DistanceMeasure measure, int numClusters, int maxIterations, float m, double threshold) throws IOException {
+ Collection<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < numClusters; i++) {
+ points.add(SAMPLE_DATA.get(i).get());
+ }
+ List<Cluster> initialClusters = Lists.newArrayList();
+ int id = 0;
+ for (Vector point : points) {
+ initialClusters.add(new SoftCluster(point, id++, measure));
+ }
+ ClusterClassifier prior = new ClusterClassifier(initialClusters, new FuzzyKMeansClusteringPolicy(m, threshold));
+ Path priorPath = new Path(output, "classifier-0");
+ prior.writeToSeqFiles(priorPath);
+
+ ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
+ loadClustersWritable(output);
+ }
+
+ // Seed with 3 random clusters and run the sequential FuzzyKMeansDriver,
+ // then load the resulting clusters for display.
+ private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
+ DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException,
+ ClassNotFoundException, InterruptedException {
+ Path clustersIn = new Path(output, "random-seeds");
+ RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure);
+ FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold,
+ true);
+
+ loadClustersWritable(output);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
new file mode 100644
index 0000000..336d69e
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplayKMeans.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.Cluster;
+import org.apache.mahout.clustering.classify.ClusterClassifier;
+import org.apache.mahout.clustering.iterator.ClusterIterator;
+import org.apache.mahout.clustering.iterator.KMeansClusteringPolicy;
+import org.apache.mahout.clustering.kmeans.KMeansDriver;
+import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+import org.apache.mahout.math.Vector;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Swing example that runs sequential k-Means over the generated sample data
+ * and displays the points with the clusters superimposed. Run main() with no
+ * arguments; intermediate data is written under the local "samples" and
+ * "output" directories, which are deleted first.
+ */
+public class DisplayKMeans extends DisplayClustering {
+
+ DisplayKMeans() {
+ initialize();
+ this.setTitle("k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+ }
+
+ public static void main(String[] args) throws Exception {
+ DistanceMeasure measure = new ManhattanDistanceMeasure();
+ Path samples = new Path("samples");
+ Path output = new Path("output");
+ Configuration conf = new Configuration();
+ HadoopUtil.delete(conf, samples);
+ HadoopUtil.delete(conf, output);
+
+ // fixed seed so successive runs produce the same sample set and display
+ RandomUtils.useTestSeed();
+ generateSamples();
+ writeSampleData(samples);
+ // edit runClusterer to false to exercise the classifier-based path instead
+ boolean runClusterer = true;
+ double convergenceDelta = 0.001;
+ int numClusters = 3;
+ int maxIterations = 10;
+ if (runClusterer) {
+ runSequentialKMeansClusterer(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
+ } else {
+ runSequentialKMeansClassifier(conf, samples, output, measure, numClusters, maxIterations, convergenceDelta);
+ }
+ new DisplayKMeans();
+ }
+
+ // Seed a ClusterClassifier with the first numClusters samples, then iterate
+ // the k-means policy sequentially and load the resulting clusters.
+ private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
+ DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException {
+ Collection<Vector> points = Lists.newArrayList();
+ for (int i = 0; i < numClusters; i++) {
+ points.add(SAMPLE_DATA.get(i).get());
+ }
+ List<Cluster> initialClusters = Lists.newArrayList();
+ int id = 0;
+ for (Vector point : points) {
+ initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure));
+ }
+ ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta));
+ Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
+ prior.writeToSeqFiles(priorPath);
+
+ ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
+ loadClustersWritable(output);
+ }
+
+ // Seed with numClusters random clusters and run the sequential
+ // KMeansDriver, then load the resulting clusters for display.
+ private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
+ DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
+ throws IOException, InterruptedException, ClassNotFoundException {
+ Path clustersIn = new Path(output, "random-seeds");
+ RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure);
+ KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true);
+ loadClustersWritable(output);
+ }
+
+ // Override the paint() method
+ @Override
+ public void paint(Graphics g) {
+ plotSampleData((Graphics2D) g);
+ plotClusters((Graphics2D) g);
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
new file mode 100644
index 0000000..2b70749
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/DisplaySpectralKMeans.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import java.awt.Graphics;
+import java.awt.Graphics2D;
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.Writer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.spectral.kmeans.SpectralKMeansDriver;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.ManhattanDistanceMeasure;
+
+/**
+ * Swing example that runs spectral k-Means over the generated sample data.
+ * It writes the full pairwise affinity matrix (Manhattan distances) as CSV,
+ * runs SpectralKMeansDriver, and displays the clustered points.
+ */
+public class DisplaySpectralKMeans extends DisplayClustering {
+
+ protected static final String SAMPLES = "samples";
+ protected static final String OUTPUT = "output";
+ protected static final String TEMP = "tmp";
+ protected static final String AFFINITIES = "affinities";
+
+ DisplaySpectralKMeans() {
+ initialize();
+ setTitle("Spectral k-Means Clusters (>" + (int) (significance * 100) + "% of population)");
+ }
+
+ public static void main(String[] args) throws Exception {
+ DistanceMeasure measure = new ManhattanDistanceMeasure();
+ Path samples = new Path(SAMPLES);
+ Path output = new Path(OUTPUT);
+ Path tempDir = new Path(TEMP);
+ Configuration conf = new Configuration();
+ HadoopUtil.delete(conf, samples);
+ HadoopUtil.delete(conf, output);
+
+ // fixed seed so successive runs produce the same sample set and display
+ RandomUtils.useTestSeed();
+ DisplayClustering.generateSamples();
+ writeSampleData(samples);
+ Path affinities = new Path(output, AFFINITIES);
+ FileSystem fs = FileSystem.get(output.toUri(), conf);
+ if (!fs.exists(output)) {
+ fs.mkdirs(output);
+ }
+
+ // NOTE(review): FileWriter writes "output/affinities" on the LOCAL
+ // filesystem with the platform default charset, so this example only
+ // works when the default filesystem is local (not HDFS) — confirm
+ // before reusing this pattern elsewhere.
+ // Emits one "i,j,distance" line per ordered pair of samples (O(n^2)).
+ try (Writer writer = new BufferedWriter(new FileWriter(affinities.toString()))){
+ for (int i = 0; i < SAMPLE_DATA.size(); i++) {
+ for (int j = 0; j < SAMPLE_DATA.size(); j++) {
+ writer.write(i + "," + j + ',' + measure.distance(SAMPLE_DATA.get(i).get(),
+ SAMPLE_DATA.get(j).get()) + '\n');
+ }
+ }
+ }
+
+ int maxIter = 10;
+ double convergenceDelta = 0.001;
+ SpectralKMeansDriver.run(new Configuration(), affinities, output, SAMPLE_DATA.size(), 3, measure,
+ convergenceDelta, maxIter, tempDir);
+ new DisplaySpectralKMeans();
+ }
+
+ @Override
+ public void paint(Graphics g) {
+ plotClusteredSampleData((Graphics2D) g, new Path(new Path(OUTPUT), "kmeans_out"));
+ }
+}

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
new file mode 100644
index 0000000..470c16c
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/display/README.txt
@@ -0,0 +1,22 @@
+The following classes can be run without parameters to generate a sample data set and
+run the reference clustering implementations over them:
+
+DisplayClustering - generates 1100 samples from three symmetric distributions. This is the same
+ data set that is used by the following clustering programs. It displays the points on a screen
+ and superimposes the model parameters that were used to generate the points. You can edit the
+ generateSamples() method to change the sample points used by these programs.
+
+ * DisplayCanopy - uses Canopy clustering
+ * DisplayKMeans - uses k-Means clustering
+ * DisplayFuzzyKMeans - uses Fuzzy k-Means clustering
+
+ * NOTE: some of these programs display the sample points and then superimpose all of the clusters
+ from each iteration. The last iteration's clusters are in bold red and the previous several are
+ colored (orange, yellow, green, blue, violet) in order after which all earlier clusters are in
+ light grey. This helps to visualize how the clusters converge upon a solution over multiple
+ iterations.
+ * NOTE: by changing the parameter values (k, ALPHA_0, numIterations) and the display SIGNIFICANCE
+ you can obtain different results.
+
+
+
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
new file mode 100644
index 0000000..c29cbc4
--- /dev/null
+++ b/community/mahout-mr/mr-examples/src/main/java/org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.streaming.tools;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.io.Closeables;
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.mahout.clustering.iterator.ClusterWritable;
+import org.apache.mahout.clustering.ClusteringUtils;
+import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.distance.DistanceMeasure;
+import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
+import org.apache.mahout.math.Centroid;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.stats.OnlineSummarizer;
+
+/**
+ * Command-line tool that summarizes clustering quality: per-cluster distance
+ * statistics (also written as a CSV file) plus the Dunn and Davies-Bouldin
+ * indexes, optionally comparing a second set of centroids over the same data.
+ */
+public class ClusterQualitySummarizer extends AbstractJob {
+ private String outputFile;
+
+ private PrintWriter fileOut;
+
+ private String trainFile;
+ private String testFile;
+ private String centroidFile;
+ private String centroidCompareFile;
+ private boolean mahoutKMeansFormat;
+ private boolean mahoutKMeansFormatCompare;
+
+ private final DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();
+
+ public void printSummaries(List<OnlineSummarizer> summarizers, String type) {
+ printSummaries(summarizers, type, fileOut);
+ }
+
+ /**
+ * Print per-cluster distance statistics to stdout and, when fileOut is
+ * non-null, append one CSV row per cluster that has at least two points.
+ *
+ * @param summarizers one summarizer per cluster, indexed by cluster id
+ * @param type tag for the CSV "is.train" column ("train" or "test")
+ * @param fileOut CSV destination, or null to skip CSV output
+ */
+ public static void printSummaries(List<OnlineSummarizer> summarizers, String type, PrintWriter fileOut) {
+ double maxDistance = 0;
+ for (int i = 0; i < summarizers.size(); ++i) {
+ OnlineSummarizer summarizer = summarizers.get(i);
+ if (summarizer.getCount() > 1) {
+ maxDistance = Math.max(maxDistance, summarizer.getMax());
+ System.out.printf("Average distance in cluster %d [%d]: %f\n", i, summarizer.getCount(), summarizer.getMean());
+ if (fileOut != null) {
+ fileOut.printf("%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n", i, summarizer.getMean(),
+ summarizer.getSD(),
+ summarizer.getQuartile(0),
+ summarizer.getQuartile(1),
+ summarizer.getQuartile(2),
+ summarizer.getQuartile(3),
+ summarizer.getQuartile(4), summarizer.getCount(), type);
+ }
+ } else {
+ // With fewer than two points the quartiles cannot be estimated, so
+ // the cluster is reported on stdout but omitted from the CSV.
+ System.out.printf("Cluster %d has %d data point(s). Need at least 2 data points in a cluster for" +
+ " OnlineSummarizer.\n", i, summarizer.getCount());
+ }
+ }
+ System.out.printf("Num clusters: %d; maxDistance: %f\n", summarizers.size(), maxDistance);
+ }
+
+ @Override
+ public int run(String[] args) throws IOException {
+ if (!parseArgs(args)) {
+ return -1;
+ }
+
+ Configuration conf = new Configuration();
+ try {
+ fileOut = new PrintWriter(new FileOutputStream(outputFile));
+ fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
+ + "distance.q4,count,is.train\n");
+
+ // Reading in the centroids (both pairs, if they exist).
+ List<Centroid> centroids;
+ List<Centroid> centroidsCompare = null;
+ if (mahoutKMeansFormat) {
+ SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
+ new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
+ centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
+ } else {
+ SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
+ new SequenceFileDirValueIterable<>(new Path(centroidFile), PathType.GLOB, conf);
+ centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
+ }
+
+ if (centroidCompareFile != null) {
+ if (mahoutKMeansFormatCompare) {
+ SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
+ new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
+ centroidsCompare = Lists.newArrayList(
+ IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
+ } else {
+ SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
+ new SequenceFileDirValueIterable<>(new Path(centroidCompareFile), PathType.GLOB, conf);
+ centroidsCompare = Lists.newArrayList(
+ IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
+ }
+ }
+
+ // Reading in the "training" set.
+ SequenceFileDirValueIterable<VectorWritable> trainIterable =
+ new SequenceFileDirValueIterable<>(new Path(trainFile), PathType.GLOB, conf);
+ Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
+ Iterable<Vector> datapoints = trainDatapoints;
+
+ printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
+ new SquaredEuclideanDistanceMeasure()), "train");
+
+ // Also adding in the "test" set.
+ if (testFile != null) {
+ SequenceFileDirValueIterable<VectorWritable> testIterable =
+ new SequenceFileDirValueIterable<>(new Path(testFile), PathType.GLOB, conf);
+ Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);
+
+ printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
+ new SquaredEuclideanDistanceMeasure()), "test");
+
+ datapoints = Iterables.concat(trainDatapoints, testDatapoints);
+ }
+
+ // At this point, all train/test CSVs have been written. We now compute quality metrics.
+ List<OnlineSummarizer> summaries =
+ ClusteringUtils.summarizeClusterDistances(datapoints, centroids, distanceMeasure);
+ List<OnlineSummarizer> compareSummaries = null;
+ if (centroidsCompare != null) {
+ compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, distanceMeasure);
+ }
+ System.out.printf("[Dunn Index] First: %f", ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
+ if (compareSummaries != null) {
+ System.out.printf(" Second: %f\n", ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
+ } else {
+ System.out.printf("\n");
+ }
+ System.out.printf("[Davies-Bouldin Index] First: %f",
+ ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
+ if (compareSummaries != null) {
+ System.out.printf(" Second: %f\n",
+ ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
+ } else {
+ System.out.printf("\n");
+ }
+ } catch (IOException e) {
+ // NOTE(review): the error is reported but the exit code stays 0;
+ // callers cannot distinguish success from failure here.
+ System.out.println(e.getMessage());
+ } finally {
+ Closeables.close(fileOut, false);
+ }
+ return 0;
+ }
+
+ // Build the CLI option group, parse args, and populate the fields above.
+ // Returns false (after help output) when parsing fails.
+ private boolean parseArgs(String[] args) {
+ DefaultOptionBuilder builder = new DefaultOptionBuilder();
+
+ Option help = builder.withLongName("help").withDescription("print this list").create();
+
+ ArgumentBuilder argumentBuilder = new ArgumentBuilder();
+ Option inputFileOption = builder.withLongName("input")
+ .withShortName("i")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
+ .withDescription("where to get seq files with the vectors (training set)")
+ .create();
+
+ Option testInputFileOption = builder.withLongName("testInput")
+ .withShortName("itest")
+ .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
+ .withDescription("where to get seq files with the vectors (test set)")
+ .create();
+
+ Option centroidsFileOption = builder.withLongName("centroids")
+ .withShortName("c")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
+ .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
+ .create();
+
+ Option centroidsCompareFileOption = builder.withLongName("centroidsCompare")
+ .withShortName("cc")
+ .withRequired(false)
+ .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
+ .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or "
+ + "StreamingKMeansDriver)")
+ .create();
+
+ Option outputFileOption = builder.withLongName("output")
+ .withShortName("o")
+ .withRequired(true)
+ .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
+ .withDescription("where to dump the CSV file with the results")
+ .create();
+
+ Option mahoutKMeansFormatOption = builder.withLongName("mahoutkmeansformat")
+ .withShortName("mkm")
+ .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+ .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+ .create();
+
+ Option mahoutKMeansCompareFormatOption = builder.withLongName("mahoutkmeansformatCompare")
+ .withShortName("mkmc")
+ .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
+ .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
+ .create();
+
+ Group normalArgs = new GroupBuilder()
+ .withOption(help)
+ .withOption(inputFileOption)
+ .withOption(testInputFileOption)
+ .withOption(outputFileOption)
+ .withOption(centroidsFileOption)
+ .withOption(centroidsCompareFileOption)
+ .withOption(mahoutKMeansFormatOption)
+ .withOption(mahoutKMeansCompareFormatOption)
+ .create();
+
+ Parser parser = new Parser();
+ parser.setHelpOption(help);
+ parser.setHelpTrigger("--help");
+ parser.setGroup(normalArgs);
+ parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));
+
+ CommandLine cmdLine = parser.parseAndHelp(args);
+ if (cmdLine == null) {
+ return false;
+ }
+
+ trainFile = (String) cmdLine.getValue(inputFileOption);
+ if (cmdLine.hasOption(testInputFileOption)) {
+ testFile = (String) cmdLine.getValue(testInputFileOption);
+ }
+ centroidFile = (String) cmdLine.getValue(centroidsFileOption);
+ if (cmdLine.hasOption(centroidsCompareFileOption)) {
+ centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
+ }
+ outputFile = (String) cmdLine.getValue(outputFileOption);
+ if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
+ mahoutKMeansFormat = true;
+ }
+ if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
+ mahoutKMeansFormatCompare = true;
+ }
+ return true;
+ }
+
+ public static void main(String[] args) throws IOException {
+ new ClusterQualitySummarizer().run(args);
+ }
+}

Loading...