http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/alg-ref/SystemML_Algorithms_Reference.tex ---------------------------------------------------------------------- diff --git a/alg-ref/SystemML_Algorithms_Reference.tex b/alg-ref/SystemML_Algorithms_Reference.tex new file mode 100644 index 0000000..75308c9 --- /dev/null +++ b/alg-ref/SystemML_Algorithms_Reference.tex @@ -0,0 +1,174 @@ +\begin{comment} + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +\end{comment} + +\documentclass[letter]{article} +\usepackage{graphicx,amsmath,amssymb,amsthm,subfigure,color,url,multirow,rotating,comment} +\usepackage{tikz} +\usepackage[normalem]{ulem} +\usepackage[np,autolanguage]{numprint} +\usepackage{tabularx} + +\usepackage[pdftex]{hyperref} +\hypersetup{ + unicode=false, % non-Latin characters in Acrobat’s bookmarks + pdftoolbar=true, % show Acrobat’s toolbar? + pdfmenubar=true, % show Acrobat’s menu? 
+ pdffitwindow=true, % window fit to page when opened + pdfstartview={FitV}, % fits the height of the page to the window + pdftitle={SystemML Algorithms Reference}, % title + pdfauthor={SystemML Team}, % author + pdfsubject={Documentation}, % subject of the document + pdfkeywords={}, % list of keywords + pdfnewwindow=true, % links in new window + bookmarksnumbered=true, % put section numbers in bookmarks + bookmarksopen=true, % open up bookmark tree + bookmarksopenlevel=1, % \maxdimen level to which bookmarks are open + colorlinks=true, % false: boxed links; true: colored links + linkcolor=black, % color of internal links + citecolor=blue, % color of links to bibliography + filecolor=black, % color of file links + urlcolor=black % color of external links +} + + +\newtheorem{definition}{Definition} +\newtheorem{example}{Example} + +\newcommand{\Paragraph}[1]{\vspace*{1ex} \noindent {\bf #1} \hspace*{1ex}} +\newenvironment{Itemize}{\vspace{-0.5ex}\begin{itemize}\setlength{\itemsep}{-0.2ex} +}{\end{itemize}\vspace{-0.5ex}} +\newenvironment{Enumerate}{\vspace{-0.5ex}\begin{enumerate}\setlength{\itemsep}{-0.2ex} +}{\end{enumerate}\vspace{-0.5ex}} +\newenvironment{Description}{\vspace{-0.5ex}\begin{description}\setlength{\itemsep}{-0.2ex} +}{\end{description}\vspace{-0.5ex}} + + +\newcommand{\SystemML}{\texttt{SystemML} } +\newcommand{\hml}{\texttt{hadoop jar SystemML.jar} } +\newcommand{\pxp}{\mathbin{\texttt{\%\textasteriskcentered\%}}} +\newcommand{\todo}[1]{{{\color{red}TODO: #1}}} +\newcommand{\Normal}{\ensuremath{\mathop{\mathrm{Normal}}\nolimits}} +\newcommand{\Prob}{\ensuremath{\mathop{\mathrm{Prob}\hspace{0.5pt}}\nolimits}} +\newcommand{\E}{\ensuremath{\mathop{\mathrm{E}}\nolimits}} +\newcommand{\mean}{\ensuremath{\mathop{\mathrm{mean}}\nolimits}} +\newcommand{\Var}{\ensuremath{\mathop{\mathrm{Var}}\nolimits}} +\newcommand{\Cov}{\ensuremath{\mathop{\mathrm{Cov}}\nolimits}} +\newcommand{\stdev}{\ensuremath{\mathop{\mathrm{st.dev}}\nolimits}} 
+\newcommand{\atan}{\ensuremath{\mathop{\mathrm{arctan}}\nolimits}} +\newcommand{\diag}{\ensuremath{\mathop{\mathrm{diag}}\nolimits}} +\newcommand{\const}{\ensuremath{\mathop{\mathrm{const}}\nolimits}} +\newcommand{\eps}{\varepsilon} + +\sloppy + +%%%%%%%%%%%%%%%%%%%%% +% header +%%%%%%%%%%%%%%%%%%%%% + +\title{\LARGE{{\SystemML Algorithms Reference}}} +\date{\today} + +%%%%%%%%%%%%%%%%%%%%% +% document start +%%%%%%%%%%%%%%%%%%%%% +\begin{document} + +%\pagenumbering{roman} +\maketitle + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Descriptive Statistics} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\input{DescriptiveStats} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Classification} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\input{LogReg} + +\subsection{Support Vector Machines} + +\input{BinarySVM} + +\input{MultiSVM} + +\input{NaiveBayes} + +\input{DecisionTrees} + +\input{RandomForest} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Clustering} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\input{Kmeans} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Regression} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\input{LinReg} + +\input{StepLinRegDS} + +\input{GLM} + +\input{StepGLM} + +\input{GLMpredict.tex} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Matrix Factorization} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\input{pca} + +\input{ALS.tex} + +%%{\color{red}\subsection{GNMF}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%{\color{red}\section{Sequence Mining}} 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Survival Analysis} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\input{KaplanMeier} + +\input{Cox} + +\bibliographystyle{abbrv} + +\bibliography{SystemML_Algorithms_Reference} + + +%%%%%%%%%%%%%%%%%%%%% +% document end +%%%%%%%%%%%%%%%%%%%%% +\end{document} + +
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/lang-ref/PyDML_Language_Reference.doc ---------------------------------------------------------------------- diff --git a/lang-ref/PyDML_Language_Reference.doc b/lang-ref/PyDML_Language_Reference.doc new file mode 100644 index 0000000..b43b6db Binary files /dev/null and b/lang-ref/PyDML_Language_Reference.doc differ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/lang-ref/Python_syntax_for_DML.doc ---------------------------------------------------------------------- diff --git a/lang-ref/Python_syntax_for_DML.doc b/lang-ref/Python_syntax_for_DML.doc new file mode 100644 index 0000000..ee43a6b Binary files /dev/null and b/lang-ref/Python_syntax_for_DML.doc differ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/lang-ref/README_HADOOP_CONFIG.txt ---------------------------------------------------------------------- diff --git a/lang-ref/README_HADOOP_CONFIG.txt b/lang-ref/README_HADOOP_CONFIG.txt new file mode 100644 index 0000000..e34d4f3 --- /dev/null +++ b/lang-ref/README_HADOOP_CONFIG.txt @@ -0,0 +1,83 @@ +Usage +----- +The machine learning algorithms described in SystemML_Algorithms_Reference.pdf can be invoked +from the hadoop command line using the described, algorithm-specific parameters. + +Generic command line arguments are provided by the help command below. + + hadoop jar SystemML.jar -? or -help + + +Recommended configurations +-------------------------- +1) JVM Heap Sizes: +We recommend an equal-sized JVM configuration for clients, mappers, and reducers. For the client +process this can be done via + + export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" + +where Xmx specifies the maximum heap size, Xms the initial heap size, and Xmn the size of the young +generation. For Xmn values equal to or less than 15% of the max heap size, we guarantee the memory budget. 
+ +For mapper or reducer JVM configurations, the following properties can be specified in mapred-site.xml, +where 'child' refers to both mapper and reducer. If map and reduce are specified individually, they take +precedence over the generic property. + + <property> + <name>mapreduce.child.java.opts</name> <!-- synonym: mapred.child.java.opts --> + <value>-Xmx2048m -Xms2048m -Xmn256m</value> + </property> + <property> + <name>mapreduce.map.java.opts</name> <!-- synonym: mapred.map.java.opts --> + <value>-Xmx2048m -Xms2048m -Xmn256m</value> + </property> + <property> + <name>mapreduce.reduce.java.opts</name> <!-- synonym: mapred.reduce.java.opts --> + <value>-Xmx2048m -Xms2048m -Xmn256m</value> + </property> + + +2) CP Memory Limitation: +There exist size limitations for in-memory matrices. Dense in-memory matrices are limited to 16GB +independent of their dimension. Sparse in-memory matrices are limited to 2G rows and 2G columns +but the overall matrix can be larger. These limitations only apply to in-memory matrices, but +NOT to matrices stored in HDFS or involved in MR computations. Setting HADOOP_CLIENT_OPTS below those limitations +prevents runtime errors. + +3) Transparent Huge Pages (on Red Hat Enterprise Linux 6): +Hadoop workloads might show very high System CPU utilization if THP is enabled. In case of such +behavior, we recommend disabling THP with + + echo never > /sys/kernel/mm/redhat_transparent_hugepage/enabled + +4) JVM Reuse: +Performance benefits from JVM reuse because data sets that fit into the mapper memory budget are +reused across tasks per slot. However, Hadoop 1.0.3 JVM Reuse is incompatible with security (when +using the LinuxTaskController). The workaround is to use the DefaultTaskController. SystemML provides +a configuration property in SystemML-config.xml to enable JVM reuse on a per-job level without +changing the global cluster configuration. 
+ + <jvmreuse>false</jvmreuse> + +5) Number of Reducers: +The number of reducers can have significant impact on performance. SystemML provides a configuration +property to set the default number of reducers per job without changing the global cluster configuration. +In general, we recommend a setting of twice the number of nodes. Smaller numbers create fewer intermediate +files, larger numbers increase the degree of parallelism for compute and parallel write. In +SystemML-config.xml, set: + + <!-- default number of reduce tasks per MR job, default: 2 x number of nodes --> + <numreducers>12</numreducers> + +6) SystemML temporary directories: +SystemML uses temporary directories in two different locations: (1) on the local file system for temporary files +created by the client process, and (2) on HDFS for intermediate results between different MR jobs and between MR jobs +and in-memory operations. Locations of these directories can be configured in SystemML-config.xml with the +following properties: + + <!-- local fs tmp working directory--> + <localtmpdir>/tmp/systemml</localtmpdir> + + <!-- hdfs tmp working directory--> + <scratch>scratch_space</scratch> + \ No newline at end of file