diff --git a/.classpath b/.classpath
new file mode 100755
index 00000000..cf17507c
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.project b/.project
new file mode 100755
index 00000000..4df1a6f3
--- /dev/null
+++ b/.project
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>BIDMach</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.scala-ide.sdt.core.scalabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.scala-ide.sdt.core.scalanature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
diff --git a/BIDMach.jar b/BIDMach.jar
new file mode 100755
index 00000000..0cf5e46f
Binary files /dev/null and b/BIDMach.jar differ
diff --git a/Copyright.txt b/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/bidmach b/bidmach
new file mode 100755
index 00000000..d0948549
--- /dev/null
+++ b/bidmach
@@ -0,0 +1,37 @@
+#!/bin/bash
+export JAVA_OPTS="-Xmx12G -Xms128M" # Set as much memory as possible
+BIDMACH_ROOT="${BASH_SOURCE[0]}"
+if [ ! `uname` = "Darwin" ]; then
+ BIDMACH_ROOT=`readlink -f "${BIDMACH_ROOT}"`
+else
+ BIDMACH_ROOT=`readlink "${BIDMACH_ROOT}"`
+fi
+BIDMACH_ROOT=`dirname "$BIDMACH_ROOT"`
+BIDMACH_ROOT="$( echo ${BIDMACH_ROOT} | sed s+/cygdrive/c+c:+ )"
+BIDMAT_ROOT="${BIDMACH_ROOT}/../BIDMat" # Change if needed
+# export JAVA_HOME="" # Set here if not set in environment
+# Fix these if needed
+JCUDA_VERSION="0.5.0"
+JCUDA_LIBDIR="${BIDMAT_ROOT}/lib"
+BIDLIB="${BIDMAT_ROOT}/lib"
+LIBDIR=${BIDMACH_ROOT}/lib
+if [ `uname` = "Darwin" ]; then
+  export DYLD_LIBRARY_PATH="${BIDMAT_ROOT}/lib:/usr/local/cuda/lib:${DYLD_LIBRARY_PATH}"
+else
+ export LD_LIBRARY_PATH="${BIDMAT_ROOT}/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+fi
+
+BIDMAT_LIBS="${BIDMAT_ROOT}/BIDMat.jar;${BIDLIB}/ptplot.jar;${BIDLIB}/ptplotapplication.jar;${BIDLIB}/jhdf5.jar;${BIDLIB}/commons-math3-3.1.1.jar;${BIDLIB}/lz4-1.1.2.jar"
+JCUDA_LIBS="${JCUDA_LIBDIR}/jcuda-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcublas-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcufft-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcurand-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcusparse-${JCUDA_VERSION}.jar"
+
+export ALL_LIBS="${BIDMACH_ROOT}/BIDMach.jar;${BIDMAT_LIBS};${JCUDA_LIBS};${JAVA_HOME}/lib/tools.jar"
+
+if [ ! "$OS" = "Windows_NT" ]; then
+ export ALL_LIBS=`echo "${ALL_LIBS}" | sed 's/;/:/g'`
+else
+ NEWPATH=`${BIDMAT_ROOT}/shortpath.bat "${CUDA_BIN_PATH}"`
+ NEWPATH=`echo $NEWPATH | sed 's_\\\\_/_g'`
+ JAVA_OPTS="-Djava.library.path=${BIDMAT_ROOT}/lib;${NEWPATH} "$JAVA_OPTS
+fi
+
+scala -nobootcp -cp "${ALL_LIBS}" -Yrepl-sync -i ${LIBDIR}/bidmach_init.scala
\ No newline at end of file
diff --git a/bidmach.cmd b/bidmach.cmd
new file mode 100755
index 00000000..d904faf3
--- /dev/null
+++ b/bidmach.cmd
@@ -0,0 +1,20 @@
+@ECHO OFF
+:: Set JAVA_HOME here if not set in environment
+:: SET JAVA_HOME=
+:: Set as much memory as possible
+(SET JAVA_OPTS=-Xmx12G -Xms128M)
+:: Fix these if needed
+SET JCUDA_VERSION=0.5.0
+SET BIDLIB=%CD%\..\BIDMat\lib
+SET LIBDIR=%CD%\lib
+SET JCUDA_LIBDIR=%BIDLIB%
+SET PATH=%BIDLIB%;%PATH%
+
+SET BIDMACH_LIBS=%BIDLIB%\..\BIDMat.jar;%CD%\BIDMach.jar;%BIDLIB%\ptplot.jar;%BIDLIB%\ptplotapplication.jar;%BIDLIB%\jhdf5.jar;%BIDLIB%\commons-math3-3.1.1.jar;%BIDLIB%\lz4-1.1.2.jar
+
+SET JCUDA_LIBS=%JCUDA_LIBDIR%\jcuda-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcublas-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcufft-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcurand-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcusparse-%JCUDA_VERSION%.jar
+
+SET ALL_LIBS=%BIDMACH_LIBS%;%JCUDA_LIBS%;%JAVA_HOME%\lib\tools.jar
+:: echo %ALL_LIBS%
+
+scala -nobootcp -cp "%ALL_LIBS%" -Yrepl-sync -i %LIBDIR%\bidmach_init.scala
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
new file mode 100755
index 00000000..9aecc3b1
--- /dev/null
+++ b/build.sbt
@@ -0,0 +1,40 @@
+
+name := "BIDMach"
+
+version := "0.1.0"
+
+organization := "edu.berkeley.bid"
+
+scalaVersion := "2.9.2"
+
+resolvers ++= Seq(
+ "Scala Tools Snapshots" at "http://scala-tools.org/repo-snapshots/",
+ "Scala Mirror" at "https://oss.sonatype.org/content/repositories/releases/"
+)
+
+libraryDependencies <<= (scalaVersion, libraryDependencies) { (sv, deps) =>
+ deps :+ ("org.scala-lang" % "scala-compiler" % sv)
+}
+
+libraryDependencies += "org.scala-lang" % "jline" % "2.9.2"
+
+libraryDependencies += "org.scalatest" %% "scalatest" % "1.8" % "test"
+
+libraryDependencies += "org.scalacheck" %% "scalacheck" % "1.9" % "test"
+
+libraryDependencies += "junit" % "junit" % "4.5" % "test"
+
+credentials += Credentials(Path.userHome / ".ivy2" / ".credentials")
+
+javacOptions ++= Seq("-source", "1.5", "-target", "1.5")
+
+scalacOptions ++= Seq("-deprecation","-target:jvm-1.5")
+
+initialCommands := scala.io.Source.fromFile("lib/bidmach_init.scala").getLines.mkString("\n")
+
+javaOptions += "-Xmx12g"
+
+//seq(ProguardPlugin.proguardSettings :_*)
+
+
+
diff --git a/lib/bidmach_init.scala b/lib/bidmach_init.scala
new file mode 100755
index 00000000..d11e2758
--- /dev/null
+++ b/lib/bidmach_init.scala
@@ -0,0 +1,10 @@
+import BIDMat.{BMat,CMat,CSMat,DMat,Dict,IDict,FMat,GMat,GIMat,GSMat,HMat,IMat,Mat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Solvers._
+import BIDMat.Plotting._
+import BIDMach.{MatDataSource,FilesDataSource,SFilesDataSource,Learner,LDAModel,NMFModel}
+
+Mat.checkMKL
+Mat.checkCUDA
+
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100755
index 00000000..bf5cb709
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1,7 @@
+
+libraryDependencies <+= sbtVersion(v => "com.github.siasia" %% "xsbt-proguard-plugin" % (v+"-0.1.1"))
+
+resolvers += "Proguard plugin repo" at "http://siasia.github.com/maven2"
+
+
+
diff --git a/src/main/scala/BIDMach/Clustering.scala b/src/main/scala/BIDMach/Clustering.scala
new file mode 100755
index 00000000..4e7e6385
--- /dev/null
+++ b/src/main/scala/BIDMach/Clustering.scala
@@ -0,0 +1,372 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
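+// Partitioning Around Medoids (PAM, k-medoids) clustering. Precomputes all pairwise
+// distances, keeps each point's ncache nearest neighbors in sorted order, and then
+// iteratively swaps medoids to reduce the total point-to-medoid distance.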
+class PAMmodel(opts:PAMmodel.Options = new PAMmodel.Options) {
+
+
+ var a:FMat = null
+ var nfeats = 0
+ var nsamps = 0
+ var ncenters = 0
+ var ntrys = 0
+ val options = opts
+ var maxdepth = 0
+ var nspills = 0
+ var bestc:IMat = null
+ var imin:IMat = null
+ var vdists:DMat = null
+ var sil:DMat = null
+ var mss = 0.0
+
+ var ncache = 0
+
+
+  def dists(x:FMat) = options.metric.dists(x)
+  def dists(x:FMat, y:FMat) = options.metric.dists(x, y)
+
+ def init(a0:FMat) = {
+
+ a = a0
+ nfeats = size(a0,2)
+ nsamps = size(a0,1)
+ ncenters = options.ncenters
+ ntrys = options.ntrys
+ vdists = dzeros(nsamps,1)
+ imin = izeros(nsamps,1)
+ sil = dzeros(nsamps,1)
+
+ ncache = min(nsamps,options.cbal*nsamps/ncenters)(0,0)
+
+ }
+
+
+ // Silhouette Score
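+  // For each sample: aa sums distances to cached neighbors in its own cluster, bb sums
+  // distances to neighbors of the nearest other cluster; silhouette = (bb-aa)/max(aa,bb).
+  // Only the ncache nearest neighbors (columns of ds/iss) are examined.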
+ def silhouetteScore(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat, lab:IMat):DMat = {
+
+ var aa = zeros(nsamps,1)
+ var bb = zeros(nsamps,1)
+
+ var silhouette = zeros(nsamps,1)
+
+ var dimclu = zeros(length(icenters),1) // Size of each cluster
+ for( i <- 0 to nsamps-1 ) { dimclu(lab(i))+=1 }
+
+ //val ncache = size(iss,1)
+ val centmap = accum(icenters, icol(1 to length(icenters)), size(ds,2), 1)
+ var i = 0
+ while (i < length(isamps)) {
+
+      val ii = isamps(i)
+      val labi = lab(ii)                // cluster label of this sample
+
+ var maxsize = dimclu(labi) + 1
+
+ var first = true
+ var j = 0
+ var count = 0
+
+      while (j < ncache && count < maxsize) {
+        val labj = lab(iss(j, ii))      // label of the j-th nearest neighbor of sample ii
+        if (labj == labi) {
+          aa(i) += ds(j, ii)            // distance within own cluster
+        } else {                        // neighbor from another cluster
+          if (first) {                  // first foreign cluster seen: widen the neighbor budget
+            maxsize += dimclu(labj) - 1
+            first = false
+          }
+          bb(i) += ds(j, ii)            // distance to the nearest other cluster
+        }
+        count += 1
+        j += 1
+      }
+ silhouette(i) = (bb(i) - aa(i)) / max(aa(i),bb(i))
+ i += 1
+ }
+
+    DMat(silhouette)                    // return as DMat to match the declared type
+
+
+ }
+
+
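+  // For each sample in isamps, find the nearest center in icenters by scanning its
+  // sorted neighbor cache; samples whose cache contains no center ("spills") fall
+  // back to exact distance computations against all centers.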
+ def mindists(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat, vmin:DMat, imin:IMat) = {
+
+ val centmap = accum(icenters, icol(1 to length(icenters)), size(ds,2), 1)
+ var i = 0
+ var ispills = izeros(1,nsamps)
+
+ var spills = 0
+
+ while (i < length(isamps)) {
+ val ii = isamps(i)
+ var continue = true
+ var j = 0
+ while (j < ncache && continue) {
+
+ if (centmap(iss(j, ii)) > 0) {
+ imin(ii) = centmap(iss(j, ii)) - 1
+ vmin(ii) = ds(j, ii)
+ continue = false
+ }
+ j += 1
+ }
+ maxdepth = math.max(maxdepth, j)
+
+
+      if (j >= ncache && continue)
+ {
+
+ ispills(spills) = i
+ spills += 1
+ nspills += 1
+
+ imin(ii) = -1
+ vmin(ii) = -1
+
+ }
+ Mat.nflops += 4*j
+ i += 1
+ }
+
+ if(spills>0)
+ {
+
+ ispills = ispills(0,0 until spills)
+
+
+ val dspill = dists(a(icenters(?,0),?),a(ispills,?))
+
+      val (ddd,iii) = mini2(dspill,1)   // distance and index of the nearest center for each spill
+
+      imin(ispills) = iii               // iii already indexes icenters (0-based)
+      for (i <- 0 until spills){ vmin(ispills(i),0) = ddd(0,i)}
+
+ }
+
+ }
+
+ def mindists(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat):(DMat, IMat) = {
+ val vmin = dzeros(nsamps,1)
+ val imin = izeros(nsamps,1)
+ mindists(ds, iss, isamps, icenters, vmin, imin)
+ (vmin, imin)
+ }
+
+
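+  // For each candidate new center, accumulate how much it would improve on the
+  // current distances vd, using each sample's sorted neighbor cache where possible.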
+ def pointdiffs(ds:FMat, iss:IMat, vd:DMat):DMat = {
+ val deltas = dzeros(nsamps,1) // Array to hold improvements in distance over vd
+
+ var ispills = izeros(1,nsamps)
+ var spills = 0
+
+ var i = 0
+ while (i < nsamps) { // Calculate improvements over vd for new candidate centers
+ var j = 0
+ while (j < ncache && ds(j,i) < vd(i)) { // using sorted order of ds
+ deltas(iss(j,i)) += ds(j,i) - vd(i)
+ j += 1
+ }
+ maxdepth = math.max(maxdepth, j)
+ if (j >= ncache){
+ if(ds(j-1,i) < vd(i))
+ {
+ ispills(spills) = i
+ nspills += 1
+ spills += 1
+ }
+ }
+
+ Mat.nflops += 4 * j
+ i += 1
+ }
+
+ if(spills > 0)
+ {
+
+ ispills = ispills(?,0 until spills)
+ val threshold = ds(ncache-1,ispills)
+
+ val dspill = dists(a)(?,ispills)
+
+ for (i <- 0 until spills){ // Calculate improvements over vd for new candidate centers
+
+ var j = 0
+ val ii = ispills(0,i)
+
+ while (j < nsamps) {
+        if(dspill(j,i) < vd(ii) && dspill(j,i) > threshold(i)) {
+ deltas(j) += dspill(j,i) - vd(ii)
+ }
+ j += 1
+ }
+ }
+
+ }
+
+ deltas
+ }
+
+
+ def sortgen(dd:FMat):(FMat,IMat) = { // Sorts the COLUMNS in ascending order...
+
+
+ if (Mat.hasCUDA <= 0) { // until GPUsort fixed
+ var (smat, imat) = sort2(dd,1)
+
+
+ if(ncache < nsamps){
+ smat = smat(0 until ncache,?)
+ imat = imat(0 until ncache,?)
+ }
+
+ (smat, imat)
+
+ } else {
+
+ var smat = dd.copy
+ var imat = icol(0->nsamps)*iones(1,nsamps)
+
+ GMat.sortGPU(smat, imat)
+
+ if(ncache < nsamps){
+ smat = smat(0 until ncache,?)
+ imat = imat(0 until ncache,?)
+ }
+
+ (smat, imat)
+
+ }
+
+ }
+
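+  // Main loop: compute all-pairs distances, sort each column, then run ntrys random
+  // restarts of greedy medoid swapping until a pass yields no improvement.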
+ def run = {
+ println("PAM clustering %d points with %d features into %d centers" format (nsamps, nfeats, ncenters))
+ flip
+ val dd = dists(a)
+ val ft1 = gflop
+ println("Distances in %f seconds, %f gflops" format (ft1._2,ft1._1))
+ flip
+ val (ds, iss) = sortgen(dd)
+
+
+
+
+ Mat.nflops += math.round(math.log(size(ds,1))/math.log(2.0))*size(ds,1)*size(ds,2)
+ val ft2 = gflop
+ println("Sort in %f seconds, %f gcomps" format (ft2._2,ft2._1))
+ var bestv:DMat = null
+ var besti:IMat = null
+ var bestvd = Double.MaxValue
+ flip
+ var itry = 0
+
+ while(itry < ntrys) {
+
+ println("Try %d" format itry)
+ val rr = rand(nsamps,1) // Get a random permutation for the centers
+ val (rs,irs) = sort2(rr,1)
+ val icenters = irs(0->ncenters,0) // Pick centers from the permutation
+ val ics = icol(0->nsamps)
+ mindists(ds, iss, ics, icenters, vdists, imin) // Get min distances from points to centers, and best center ids
+ println(" pass=0, mean dist=%f" format mean(vdists,1).v)
+ val vtmp = vdists.copy
+ val itmp = imin.copy
+ var nchanged = 1
+ var ipass = 0
+ var totchanged = 0
+ while (nchanged > 0 && ipass < options.maxpasses) { // Keep making passes until no improvements
+ ipass += 1
+ nchanged = 0
+ var ipc = 0
+ while (ipc < ncenters) { // Try to improve this center (ipc)
+ vtmp <-- vdists // Copy distances
+ val ifix = find(imin == ipc) // Find points in cluster with this center
+ val tcents = icenters((0->ipc) \ ((ipc+1)->ncenters),0) // List of centers minus the current one
+ mindists(ds, iss, ifix, tcents, vtmp, itmp) // vtmp holds distances to centers minus the current center
+ val deltas = pointdiffs(ds, iss, vtmp) // deltas holds improvements for each potential center over vtmp
+ val (vs,is) = mini2(deltas) // Find best new center
+ if (vs.v + sum(vtmp).v < sum(vdists).v && is.v != icenters(ipc,0)) { // Is the new center better than the old (and not equal to it)?
+ icenters(ipc) = is.v // If yes, update the center list
+ mindists(ds, iss, ics, icenters, vdists, imin) // Compute new distances and centers
+ nchanged += 1
+ if (options.verb) println(" pass=%d, ipc=%d, mean dist=%f, nchanged=%d" format (ipass, ipc, mean(vdists,1).v, nchanged))
+ }
+ ipc += 1
+ }
+ println(" pass=%d, mean dist=%f, nchanged=%d, nspills=%d" format (ipass, mean(vdists,1).v, nchanged, nspills))
+ totchanged += nchanged
+ }
+ val mv = mean(vdists).v
+ if (mv < bestvd) {
+ bestc = icenters
+ bestv = vdists
+ besti = imin
+ bestvd = mv
+ }
+ itry += 1
+ }
+ val t3=gflop
+ val vdists2 = mini(dd(?,bestc),2)
+ println("Optimum in %f secs, %f gflops, mean dist=%f, verify=%f\n maxdepth=%d, nspills=%d, ncache=%d\nTotal time %f seconds" format
+ (t3._2, t3._1, bestvd, mean(DMat(vdists2),1).v, maxdepth, nspills, ncache, t3._2+ft2._2+ft1._2))
+
+ val ics = icol(0->nsamps)
+ flip
+
+    sil = silhouetteScore(ds, iss, ics, bestc, besti)  // use the labels that go with bestc
+    val t4 = gflop
+    mss = mean(sil,1).v                                // average silhouette over all samples
+
+    println("Mean Silhouette Score (MSS) %f \n Elapsed time %f secs" format (mss, t4._2))
+
+ }
+
+}
+
+object PAMmodel {
+
+ class Options {
+ var ncenters = 1000
+ var maxpasses = 10
+ var ntrys = 1
+ var metric:Distance = null
+ var verb = false
+ var cbal = 10
+
+ }
+
+ def runit(nsamps:Int, nfeats:Int, ncenters:Int,metric:String) = {
+ println("Generating dataset")
+ val c = rand(ncenters, nfeats)
+ val a = rand(nsamps, nfeats)*0.3f
+ for (i <- 0 until nsamps by ncenters) {val il = math.min(i+ncenters, nsamps); a(i->il,?) += c(0->(il-i),?)}
+ val cc = new PAMmodel
+ cc.options.ncenters = ncenters
+ cc.options.metric = metric match {
+ case "euclid" => new euclidDistance
+ case "cosangle" => new cosangleDistance
+ case "corr" => new correlationDistance
+ }
+ cc.init(a)
+ cc.run
+ }
+
+ def main(args:Array[String]) = {
+ Mat.checkCUDA
+ val nsamps= args(0).toInt
+ val nfeats = args(1).toInt
+ val ncenters = args(2).toInt
+ val metric = args(3)
+ runit(nsamps, nfeats, ncenters, metric)
+ }
+}
+
diff --git a/src/main/scala/BIDMach/Copyright.txt b/src/main/scala/BIDMach/Copyright.txt
new file mode 100755
index 00000000..21326596
--- /dev/null
+++ b/src/main/scala/BIDMach/Copyright.txt
@@ -0,0 +1,25 @@
+Copyright (c) 2012, Regents of the University of California
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/src/main/scala/BIDMach/DataSource.scala b/src/main/scala/BIDMach/DataSource.scala
new file mode 100755
index 00000000..7561b20f
--- /dev/null
+++ b/src/main/scala/BIDMach/DataSource.scala
@@ -0,0 +1,738 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import scala.actors._
+import java.io._
+
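+// Minibatch iterator contract: call init once, then next while hasNext holds;
+// omats holds the output matrices, which are reused across blocks.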
+abstract class DataSource(val opts:DataSource.Options = new DataSource.Options) {
+ def next:Array[Mat]
+ def hasNext:Boolean
+ def reset:Unit
+ def putBack(mats:Array[Mat],i:Int):Unit = {throw new RuntimeException("putBack not implemented")}
+ def setupPutBack(n:Int,dim:Int):Unit = {throw new RuntimeException("putBack not implemented")}
+ def nmats:Int
+ def init:Unit
+ def progress:Float
+ var omats:Array[Mat] = null
+}
+
+class MatDataSource(var mats:Array[Mat], override val opts:MatDataSource.Options = new MatDataSource.Options) extends DataSource(opts) {
+ var sizeMargin = 0f
+ var here = 0
+ var there = 0
+ var blockSize = 0
+ var totalSize = 0
+ var umat:Mat = null
+
+ omats = null
+
+ def init = {
+ sizeMargin = opts.sizeMargin
+ blockSize = opts.blockSize
+ if (opts.addConstFeat) {
+ mats(0) = mats(0) on sparse(ones(1, mats(0).ncols))
+ }
+ if (opts.featType == 0) {
+ mats(0).contents.set(1)
+ }
+ here = -blockSize
+ totalSize = mats(0).ncols
+ omats = new Array[Mat](mats.length)
+ for (i <- 0 until mats.length) {
+ omats(i) = mats(i) match {
+ case mm:SMat => SMat(mats(i).nrows, blockSize, (mats(i).nnz * sizeMargin * blockSize / mats(i).ncols).toInt)
+ case mm:SDMat => SDMat(mats(i).nrows, blockSize, (mats(i).nnz * sizeMargin * blockSize / mats(i).ncols).toInt)
+ case _ => mats(i).zeros(mats(i).nrows, blockSize)
+ }
+ }
+ }
+
+ def nmats = omats.length
+
+ def reset = {
+ here = -blockSize
+ }
+
+ def next:Array[Mat] = {
+ here = math.min(here+blockSize, mats(0).ncols)
+ there = math.min(here+blockSize, mats(0).ncols)
+ for (i <- 0 until mats.length) {
+ omats(i) = mats(i).colslice(here, there, omats(i))
+ }
+ omats
+ }
+
+ def hasNext:Boolean = {
+ here + blockSize < mats(0).ncols
+ }
+
+ override def setupPutBack(n:Int, dim:Int) = {
+ if (mats.length < n) {
+ val newmats = new Array[Mat](n)
+ for (i <- 0 until n-1) {
+ newmats(i) = mats(i)
+ }
+ newmats(n-1) = ones(dim, mats(0).ncols)
+ mats = newmats
+ }
+ }
+
+ override def putBack(tmats:Array[Mat],i:Int):Unit = {
+ tmats(i).colslice(0, tmats(i).ncols, mats(i), here)
+ }
+
+ def progress = {
+ math.min((here+blockSize)*1f/totalSize, 1f)
+ }
+
+}
+
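+// Streams blocks of samples from a dated sequence of files on disk, with
+// opts.lookahead background actors prefetching files into matqueue.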
+class FilesDataSource(override val opts:FilesDataSource.Options = new FilesDataSource.Options) extends DataSource(opts) {
+ var sizeMargin = 0f
+ var blockSize = 0
+ @volatile var fileno = 0
+ var rowno = 0
+ var nstart = 0
+ var fnames:List[(Int)=>String] = null
+ omats = null
+ var matqueue:Array[Array[Mat]] = null
+ var ready:IMat = null
+ var stop:Boolean = false
+ var permfn:(Int)=>Int = null
+ var totalSize = 0
+
+ def softperm(nstart:Int, nend:Int) = {
+ val dd1 = nstart / 24
+ val hh1 = nstart % 24
+ val dd2 = nend / 24
+ val hh2 = nend % 24
+ val (dmy, ii) = sort2(rand(dd2-dd1+1+opts.lookahead))
+ (n:Int) => {
+ val dd = n / 24
+ val hh = n % 24
+ val ddx = ii(dd-dd1)+dd1
+ val ddx0 = ddx % 31
+ val ddx1 = ddx / 31
+ val hhdd = hh + 24 * (ddx0 - 1)
+ (ddx1 * 31 + (hhdd % 31 + 1)) * 24 + hhdd / 31
+ }
+ }
+
+ def initbase = {
+ nstart = opts.nstart
+ fnames = opts.fnames
+ blockSize = opts.blockSize
+ while (!fileExists(fnames(0)(nstart))) {nstart += 1}
+ if (opts.order == 1) {
+ val (dmy, rr) = sort2(rand(opts.nend+opts.lookahead+1-nstart,1)) // Randomize the file read order
+ permfn = (a:Int) => rr(a-nstart)+nstart
+ } else {
+ permfn = (n:Int) => { // Stripe reads across disks (different days)
+ val (yy, mm, dd, hh) = FilesDataSource.decodeDate(n)
+ val hhdd = hh + 24 * (dd - 1)
+ FilesDataSource.encodeDate(yy, mm, hhdd % 31 + 1, hhdd / 31)
+ }
+ }
+ fileno = nstart // Number of the current output file
+ rowno = 0 // row number in the current output file
+ totalSize = opts.nend - nstart
+ matqueue = new Array[Array[Mat]](opts.lookahead) // Queue of matrices for each output matrix
+    ready = -iones(opts.lookahead, 1)                                     // File number currently loaded in each queue slot (-1 = none yet)
+ for (i <- 0 until opts.lookahead) {
+ matqueue(i) = new Array[Mat](fnames.size)
+ }
+ for (i <- 0 until opts.lookahead) {
+ Actor.actor {
+ prefetch(nstart + i)
+ }
+ }
+ }
+
+ def reset = {
+ fileno = nstart
+ rowno = 0
+ for (i <- 0 until opts.lookahead) {
+ val ifile = nstart + i
+ val ifilex = ifile % opts.lookahead
+ ready(ifilex) = ifile - opts.lookahead
+ }
+ }
+
+ def init = {
+ initbase
+ omats = new Array[Mat](fnames.size)
+ for (i <- 0 until fnames.size) {
+ var mm = HMat.loadMat(fnames(i)(nstart))
+ if (opts.dorows) {
+ omats(i) = mm.zeros(blockSize, mm.ncols)
+ } else {
+ omats(i) = mm.zeros(mm.nrows, blockSize)
+ }
+ }
+ }
+
+ def progress = {
+ (fileno-nstart)*1f / totalSize
+ }
+
+ def nmats = omats.length
+
+ def next:Array[Mat] = {
+ var donextfile = false
+ var todo = blockSize
+ while (todo > 0 && fileno < opts.nend) {
+ var nrow = rowno
+ val filex = fileno % opts.lookahead
+ while (ready(filex) < fileno) Thread.sleep(1)
+ for (i <- 0 until fnames.size) {
+ val matq = matqueue(filex)(i)
+ if (matq != null) {
+ val matqnr = if (opts.dorows) matq.nrows else matq.ncols
+ nrow = math.min(rowno + todo, matqnr)
+ if (opts.dorows) {
+ omats(i) = matq.rowslice(rowno, nrow, omats(i), blockSize - todo)
+ } else {
+ omats(i) = matq.colslice(rowno, nrow, omats(i), blockSize - todo)
+ }
+ if (matqnr == nrow) donextfile = true
+ } else {
+ donextfile = true
+ }
+ }
+ todo -= nrow - rowno
+ if (donextfile) {
+ fileno += 1
+ rowno = 0
+ donextfile = false
+ } else {
+ rowno = nrow
+ }
+ }
+ omats
+ }
+
+ def fileExists(fname:String) = {
+ val testme = new File(fname)
+ testme.exists
+ }
+
+ def lazyTranspose(a:Mat) = {
+ a match {
+ case af:FMat => FMat(a.ncols, a.nrows, af.data)
+ case ad:DMat => DMat(a.ncols, a.nrows, ad.data)
+ case ai:IMat => IMat(a.ncols, a.nrows, ai.data)
+      case _ => throw new RuntimeException("lazyTranspose can't deal with "+a.getClass.getName)
+ }
+ }
+
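+  // Body of one prefetch actor: repeatedly loads the next file assigned to this
+  // queue slot (ifilex) as soon as the consumer has moved past it.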
+ def prefetch(ifile:Int) = {
+ val ifilex = ifile % opts.lookahead
+ ready(ifilex) = ifile - opts.lookahead
+ while (!stop) {
+ while (ready(ifilex) >= fileno) Thread.sleep(1)
+ val inew = ready(ifilex) + opts.lookahead
+ val pnew = permfn(inew)
+ val fexists = fileExists(fnames(0)(pnew)) && (rand(1,1).v < opts.sampleFiles)
+ for (i <- 0 until fnames.size) {
+ matqueue(ifilex)(i) = if (fexists) {
+ HMat.loadMat(fnames(i)(pnew), matqueue(ifilex)(i))
+ } else null
+// println("%d" format inew)
+ }
+ ready(ifilex) = inew
+ }
+ }
+
+ def hasNext:Boolean = {
+ (fileno < opts.nend)
+ }
+
+}
+
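+// Sparse-file datasource: each call to next assembles one SMat block from several
+// parallel ngram files, stacking the per-file feature ranges given by opts.fcounts.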
+class SFilesDataSource(override val opts:SFilesDataSource.Options = new SFilesDataSource.Options) extends FilesDataSource(opts) {
+
+ var inptrs:IMat = null
+ var offsets:IMat = null
+
+ override def init = {
+ initbase
+ var totsize = sum(opts.fcounts).v
+ if (opts.addConstFeat) totsize += 1
+ omats = new Array[Mat](1)
+ omats(0) = SMat(totsize, opts.blockSize, opts.blockSize * opts.eltsPerSample)
+ inptrs = izeros(opts.fcounts.length, 1)
+ offsets = 0 on cumsum(opts.fcounts)
+ }
+
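+  // Binary search: index of the first row of mat whose first column is >= i.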
+ def binFind(i:Int, mat:Mat):Int = {
+ val imat = mat.asInstanceOf[IMat]
+ val nrows = mat.nrows
+ var ibeg = 0
+ var iend = nrows
+ while (ibeg < iend) {
+ val imid = (iend + ibeg)/2
+ if (i > imat(imid, 0)) {
+ ibeg = imid+1
+ } else {
+ iend = imid
+ }
+ }
+ iend
+ }
+
+ def sprowslice(inmat:Array[Mat], rowno:Int, nrow:Int, omat0:Mat, done:Int):Mat = {
+ val omat = omat0.asInstanceOf[SMat]
+ val ioff = Mat.ioneBased
+ var idone = done
+ var innz = omat.nnz
+ val lims = opts.fcounts
+ val nfiles = opts.fcounts.length
+ val addConstFeat = opts.addConstFeat
+ val featType = opts.featType
+ var j = 0
+ while (j < nfiles) {
+ inptrs(j, 0) = binFind(rowno, inmat(j))
+ j += 1
+ }
+ var irow = rowno
+ while (irow < nrow) {
+ var j = 0
+ while (j < nfiles) {
+ val mat = inmat(j).asInstanceOf[IMat]
+ val mrows = mat.nrows
+ var k = inptrs(j)
+ while (k < mrows && mat.data(k) < irow) k += 1
+ inptrs(j) = k
+ val xoff = innz - k
+ val yoff = offsets(j) + ioff
+ while (k < mat.nrows && mat.data(k) == irow && mat.data(k+mrows) < lims(j)) {
+ omat.ir(xoff + k) = mat.data(k+mrows) + yoff
+ omat.data(xoff + k) = if (featType == 0) 1f else mat.data(k+2*mrows)
+ k += 1
+ }
+ innz = xoff + k
+ inptrs(j) = k
+ j += 1
+ }
+ irow += 1
+ idone += 1
+ if (addConstFeat) {
+ omat.ir(innz) = omat.nrows - 1 + ioff
+ omat.data(innz) = 1
+ innz += 1
+ }
+ omat.jc(idone) = innz + ioff
+ }
+ omat.nnz0 = innz
+ omat
+ }
+
+ def spmax(matq:Array[Mat]):Int = {
+ var maxv = 0
+ for (i <- 0 until matq.length) {
+ if (matq(i) != null) {
+ val mat = matq(i).asInstanceOf[IMat]
+ maxv = math.max(maxv, mat(mat.nrows-1,0))
+ }
+ }
+ maxv
+ }
+
+ def fillup(mat:Mat, todo:Int) = {
+ val smat = mat.asInstanceOf[SMat]
+ val ncols = mat.ncols
+ var i = ncols - todo
+ val theend = smat.jc(i)
+ while (i < ncols) {
+ i += 1
+ smat.jc(i) = theend
+ }
+ }
+
+ def flushMat(mat:Mat) = {
+ val smat = mat.asInstanceOf[SMat]
+ smat.nnz0 = 0
+ smat.jc(0) = Mat.ioneBased
+ }
+
+ override def next:Array[Mat] = {
+ var donextfile = false
+ var todo = blockSize
+ flushMat(omats(0))
+ while (todo > 0 && fileno < opts.nend) {
+ var nrow = rowno
+ val filex = fileno % opts.lookahead
+ while (ready(filex) < fileno) Thread.sleep(1)
+ val spm = spmax(matqueue(filex))
+ nrow = math.min(rowno + todo, spm)
+ val matq = matqueue(filex)
+ if (matq(0) != null) {
+ omats(0) = sprowslice(matq, rowno, nrow, omats(0), blockSize - todo)
+ if (spm == nrow) donextfile = true
+ } else {
+ donextfile = true
+ }
+ todo -= nrow - rowno
+ if (donextfile) {
+ fileno += 1
+ rowno = 0
+ donextfile = false
+ } else {
+ rowno = nrow
+ }
+ }
+ if (todo > 0) {
+ fillup(omats(0), todo)
+ }
+ omats
+ }
+
+}
+
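+// Interleaves two datasources: each bBlock-sized column chunk is drawn from s1 with
+// probability alpha (otherwise s2), with per-chunk keep rates samp1 and samp2.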
+class BlendedDataSource(val s1:DataSource, val s2:DataSource, var alpha:Float, var samp1:Float, var samp2:Float,
+ override val opts:BlendedDataSource.Options = new BlendedDataSource.Options) extends DataSource(opts) {
+ var sizeMargin = 0f
+ var here = 0L
+ var there = 0
+ var iptr1 = 0
+ var iptr2 = 0
+ var blockSize = 0
+ var bBlock = 0
+ var totalSize = 0
+ var randv:FMat = null
+ var rands1:FMat = null
+ var rands2:FMat = null
+ var mats1:Array[Mat] = null
+ var mats2:Array[Mat] = null
+ omats = null
+
+ def init = {
+ sizeMargin = opts.sizeMargin
+ blockSize = opts.blockSize
+ bBlock = opts.bBlock
+ randv = rand(1, blockSize/bBlock + 1)
+ rands1 = rand(1, blockSize/bBlock + 1)
+ rands2 = rand(1, blockSize/bBlock + 1)
+ here = -blockSize
+ s1.opts.addConstFeat = opts.addConstFeat
+ s2.opts.addConstFeat = opts.addConstFeat
+ s1.opts.featType = opts.featType
+ s2.opts.featType = opts.featType
+ s1.init
+ s2.init
+ mats1 = s1.next
+ mats2 = s2.next
+ totalSize = mats1(0).ncols
+ omats = new Array[Mat](mats1.length)
+ for (i <- 0 until mats1.length) {
+ omats(i) = mats1(i) match {
+ case mm:SMat => SMat(mats1(i).nrows, blockSize, (mats1(i).nnz * sizeMargin).toInt)
+ case mm:SDMat => SDMat(mats1(i).nrows, blockSize, (mats1(i).nnz * sizeMargin).toInt)
+ case _ => mats1(i).zeros(mats1(i).nrows, blockSize)
+ }
+ }
+ }
+
+ def nmats = omats.length
+
+ def reset = {
+ s1.reset
+ s2.reset
+ here = -blockSize
+ }
+
+ @inline def copycol(inmats:Array[Mat], iptr:Int, jptr:Int, omats:Array[Mat], here:Int) = {
+ var imat = 0
+ while (imat < inmats.length) {
+ omats(imat) = inmats(imat).colslice(iptr, jptr, omats(imat), here)
+ imat += 1
+ }
+ }
+
+ def next:Array[Mat] = {
+ rand(0, 1f, randv)
+ var i = 0
+ var xptr = 0
+ while (xptr < blockSize && hascol(mats1, iptr1, s1) && hascol(mats2, iptr2, s2)) {
+ if (randv.data(i) < alpha) {
+ while (iptr1 < mats1(0).ncols && rands1.data(iptr1/bBlock) > samp1) iptr1 += bBlock
+ if (iptr1 >= mats1(0).ncols) {
+ mats1 = s1.next
+ iptr1 = 0
+          rand(0, 1f, rands1)          // refresh the sampling thresholds for source 1
+ }
+ val jptr1 = math.min(mats1(0).ncols, iptr1 + math.min(bBlock, math.min(blockSize, omats(0).ncols) - xptr))
+ copycol(mats1, iptr1, jptr1, omats, xptr)
+ xptr += jptr1 - iptr1
+ iptr1 = jptr1
+ } else {
+ while (iptr2 < mats2(0).ncols && rands2.data(iptr2/bBlock) > samp2) iptr2 += bBlock
+ if (iptr2 >= mats2(0).ncols) {
+ mats2 = s2.next
+ iptr2 = 0
+          rand(0, 1f, rands2)          // refresh the sampling thresholds for source 2
+ }
+        val jptr2 = math.min(mats2(0).ncols, iptr2 + math.min(bBlock, math.min(blockSize, omats(0).ncols) - xptr))
+        copycol(mats2, iptr2, jptr2, omats, xptr)
+ xptr += jptr2 - iptr2
+ iptr2 = jptr2
+ }
+ i += 1
+ }
+ here += xptr
+ if (xptr == blockSize) {
+ omats
+ } else {
+      shrinkmats(omats, xptr)          // keep only the columns actually filled
+ }
+ }
+
+ def hascol(mats:Array[Mat], iptr:Int, ss:DataSource):Boolean = {
+ (iptr < mats(0).ncols) || ss.hasNext
+ }
+
+ def hasNext:Boolean = {
+ hascol(mats1, iptr1, s1) && hascol(mats2, iptr2, s2)
+ }
+
+ def shrinkmats(xmats:Array[Mat], n:Int) = {
+ val outarr = new Array[Mat](omats.length)
+ var imat = 0
+ while (imat < omats.length) {
+ outarr(imat) = xmats(imat).colslice(0, n, null)
+ imat += 1
+ }
+ outarr
+ }
+
+ def progress = {
+ math.max(s1.progress, s2.progress)
+ }
+}
+
+
+object DataSource {
+ class Options {
+ var blockSize = 100000
+ var sizeMargin = 3f
+ var sample = 1f
+ var addConstFeat:Boolean = false
+ var featType:Int = 1 // 0 = binary features, 1 = linear features
+ }
+}
+
+object MatDataSource {
+ class Options extends DataSource.Options {
+
+ }
+}
+
+
+object FilesDataSource {
+
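+  // Dates are packed into one int as hour + 24*(day + 31*(month + 12*year)), so
+  // consecutive encoded values step through hours.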
+ def encodeDate(yy:Int, mm:Int, dd:Int, hh:Int) = (((12*yy + mm) * 31) + dd)*24 + hh
+
+ def decodeDate(n:Int):(Int, Int, Int, Int) = {
+ val days = n / 24
+ val dd = (days - 1) % 31 + 1
+ val months = (days - dd) / 31
+ val mm = (months - 1) % 12 + 1
+ val yy = (months - mm) / 12
+ (yy, mm, dd, n % 24)
+ }
+
+ def sampleFun(fname:String):(Int)=>String = {
+ (n:Int) => {
+ val (yy, mm, dd, hh) = decodeDate(n)
+ (fname format ((n / 24) % 16, yy, mm, dd, hh))
+ }
+ }
+
+ def sampleFun(fname:String, m:Int, i:Int):(Int)=>String = {
+ (n0:Int) => {
+ val n = n0 * m + i
+ val (yy, mm, dd, hh) = decodeDate(n)
+ (fname format ((n / 24) % 16, yy, mm, dd, hh))
+ }
+ }
+
+
+ class Options extends DataSource.Options {
+ val localDir:String = ""
+ def fnames:List[(Int)=>String] = null
+ var lookahead = 8
+ var sampleFiles = 1.0f
+ var nstart:Int = 0
+ var nend:Int = 0
+ var dorows:Boolean = true
+ var order:Int = 1 // 0 = sequential order, 1 = random
+ }
+}
+
+object BlendedDataSource {
+ class Options extends DataSource.Options {
+ var bBlock = 1000
+ }
+}
+
+object SFilesDataSource {
+ class Options extends FilesDataSource.Options {
+ var fcounts:IMat = null
+ var eltsPerSample = 0
+ }
+
+ val twitterFeatureDir = "/disk%02d/twitter/featurized/%04d/%02d/%02d/"
+ val twitterSmileyFeatureDir = "/disk%02d/twitter/smiley/featurized/%04d/%02d/%02d/"
+
+ def twitterWords(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2012,12,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nfeats:Int = 100000) = {
+ val opts = new SFilesDataSource.Options {
+ override def fnames:List[(Int)=>String] = List(FilesDataSource.sampleFun(twitterFeatureDir + "unifeats%02d.lz4", n, i))
+ fcounts = icol(nfeats)
+ nstart = nstart0/n
+ nend = nend0/n
+ order = 1
+ blockSize = 100000
+ eltsPerSample = 40
+ lookahead = 3
+ }
+ new SFilesDataSource(opts)
+ }
+
+ def twitterSmileyWords(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2013,7,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nfeats:Int = 100000) = {
+ val opts = new SFilesDataSource.Options {
+ override def fnames:List[(Int)=>String] = List(FilesDataSource.sampleFun(twitterSmileyFeatureDir + "unifeats%02d.lz4", n, i))
+ fcounts = icol(nfeats)
+ nstart = nstart0/n
+ nend = nend0/n
+ order = 1
+ blockSize = 100000
+ eltsPerSample = 40
+ lookahead = 3
+ }
+ new SFilesDataSource(opts)
+ }
+
+ def twitterNgrams(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2012,12,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nuni0:Int = 50,
+ nbi0:Int = 100,
+ ntri0:Int = 200) = {
+ val opts = new SFilesDataSource.Options {
+ override def fnames:List[(Int)=>String] = List(
+ FilesDataSource.sampleFun(twitterFeatureDir + "unifeats%02d.lz4", n, i),
+ FilesDataSource.sampleFun(twitterFeatureDir + "bifeats%02d.lz4", n, i),
+ FilesDataSource.sampleFun(twitterFeatureDir + "trifeats%02d.lz4", n, i)
+ )
+ fcounts = icol(nuni0*1000,nbi0*1000,ntri0*1000)
+ nstart = nstart0/n
+ nend = nend0/n
+ order = 1
+ blockSize = 100000
+ eltsPerSample = 40
+ lookahead = 3
+ }
+ new SFilesDataSource(opts)
+ }
+
+ def twitterSmileyNgrams(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2013,7,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nuni0:Int = 50,
+ nbi0:Int = 100,
+ ntri0:Int = 200) = {
+ val opts = new SFilesDataSource.Options {
+ override def fnames:List[(Int)=>String] = List(
+ FilesDataSource.sampleFun(twitterSmileyFeatureDir + "unifeats%02d.lz4", n, i),
+ FilesDataSource.sampleFun(twitterSmileyFeatureDir + "bifeats%02d.lz4", n, i),
+ FilesDataSource.sampleFun(twitterSmileyFeatureDir + "trifeats%02d.lz4", n, i)
+ )
+ fcounts = icol(nuni0*1000,nbi0*1000,ntri0*1000)
+ nstart = nstart0/n
+ nend = nend0/n
+ order = 1
+ blockSize = 100000
+ eltsPerSample = 40
+ lookahead = 3
+ }
+ new SFilesDataSource(opts)
+ }
+
+ def twitterWordBlend(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2013,7,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nfeats:Int = 10000) = {
+ val ds1 = twitterWords(nstart0, nend0, n, i, nfeats)
+ val ds2 = twitterSmileyWords(nstart0, nend0, n, i, nfeats)
+ if (n > 1) {
+ ds1.opts.lookahead = 2
+ ds2.opts.lookahead = 2
+ }
+ val opts3 = new BlendedDataSource.Options
+ new BlendedDataSource(ds1, ds2, 0.5f, 1f, 1f, opts3)
+ }
+
+ def twitterNgramBlend(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2013,7,1,0),
+ n:Int = 1,
+ i:Int = 0,
+ nuni0:Int = 50,
+ nbi0:Int = 100,
+ ntri0:Int = 200) = {
+ val ds1 = twitterNgrams(nstart0, nend0, n, i, nuni0, nbi0, ntri0)
+ val ds2 = twitterSmileyNgrams(nstart0, nend0, n, i, nuni0, nbi0, ntri0)
+ if (n > 1) {
+ ds1.opts.lookahead = 2
+ ds2.opts.lookahead = 2
+ }
+ val opts3 = new BlendedDataSource.Options
+ new BlendedDataSource(ds1, ds2, 0.7f, 1f, 1f, opts3)
+ }
+
+ def testSources(nthreads:Int=4,ff:(Int,Int,Int,Int,Int)=>DataSource = twitterWords, nfeats:Int=100000):IMat = {
+ val nstart0 = FilesDataSource.encodeDate(2012,3,22,0)
+ val nend0 = FilesDataSource.encodeDate(2013,7,1,0)
+ var bytes = 0L
+ var done = 0L
+ var step = 10000000000L
+ var stop = izeros(1,1)
+ tic
+ for (i <- 0 until nthreads) {
+ scala.actors.Actor.actor {
+ val ss = ff(nstart0, nend0, nthreads, i, nfeats)
+ ss.init
+ while (ss.hasNext && stop.v != 1) {
+ val a = ss.next
+ bytes += 12L*a(0).nnz
+ if (bytes > done + step) {
+ done = (bytes/step)*step
+ val t=toc
+ println("GB=%4.2f, t=%4.2f, MB/s=%4.2f" format (bytes/1e9, t, bytes/t/1e6))
+ }
+ }
+ val t = toc
+ println("Thread %d done, GB=%4.2f, t=%4.2f, MB/s=%4.2f" format (i, bytes/1e9, t, bytes/t/1e6))
+ }
+ }
+ stop
+ }
+}
+
diff --git a/src/main/scala/BIDMach/Distance.scala b/src/main/scala/BIDMach/Distance.scala
new file mode 100644
index 00000000..ec94dda8
--- /dev/null
+++ b/src/main/scala/BIDMach/Distance.scala
@@ -0,0 +1,125 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Distance(val opts:Distance.Options = new Distance.Options) {
+
+ def dists(a:FMat):FMat
+ def dists(a:FMat, b:FMat):FMat
+
+}
+
+object Distance {
+ class Options {
+
+ }
+}
+
+
+//Euclidean Distance
+class euclidDistance(override val opts:euclidDistance.Options = new euclidDistance.Options) extends Distance(opts) {
+
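+  // Uses ||x-y||^2 = ||x||^2 - 2*x.y + ||y||^2, so a single Gram matrix a*a^T yields
+  // all pairwise squared distances; xTG runs the multiply on the GPU.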
+ override def dists(a:FMat):FMat = {
+ val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a;
+ val d1 = getdiag(dd)
+ dd ~ dd * 2.0f
+ dd ~ d1 - dd
+ dd ~ dd + (d1.t)
+ max(dd, 0f, dd)
+ sqrt(dd, dd)
+ dd
+ }
+
+ override def dists(a:FMat, b:FMat):FMat = {
+ val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a);
+ val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b);
+ val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b;
+ ab ~ ab * 2.0f
+ ab ~ aa - ab + (bb.t)
+ max(ab, 0f, ab)
+ sqrt(ab, ab)
+
+ ab
+ }
+}
+
+object euclidDistance {
+ class Options extends Distance.Options {
+
+ }
+}
+
+
+//Cosangle Distance
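+// (one minus the cosine of the angle between sample vectors)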
+class cosangleDistance(override val opts:cosangleDistance.Options = new cosangleDistance.Options) extends Distance(opts) {
+
+ override def dists(a:FMat):FMat = {
+ val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a;
+ var d1 = getdiag(dd)
+ sqrt(d1, d1)
+ d1 = if (Mat.hasCUDA > 0) 0.0f+(d1 xTG d1) else 0.0f+(d1 xT d1);
+ dd ~ 1 - dd / d1
+ dd
+ }
+
+ override def dists(a:FMat, b:FMat):FMat = {
+ val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a);
+ val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b);
+ val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b;
+
+ sqrt(aa, aa)
+ sqrt(bb, bb)
+
+ val dd = if (Mat.hasCUDA > 0) 0.0f+(aa xTG bb) else 0.0f+(aa xT bb);
+ ab ~ 1 - ab / dd
+ ab
+ }
+}
+
+object cosangleDistance {
+ class Options extends Distance.Options {
+
+ }
+}
+
+
+//Correlation Distance
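+// (centers each column of the data in place, then applies the cosangle distance)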
+class correlationDistance(override val opts:correlationDistance.Options = new correlationDistance.Options) extends Distance(opts) {
+
+ override def dists(a:FMat):FMat = {
+ val mu = mean(a,1)
+ a ~ a - mu
+ val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a;
+ var d1 = getdiag(dd)
+ sqrt(d1, d1)
+ d1 = if (Mat.hasCUDA > 0) 0+(d1 xTG d1) else 0+(d1 xT d1);
+ dd ~ 1 - dd / d1
+ dd
+ }
+
+ override def dists(a:FMat, b:FMat):FMat = {
+ val mua = mean(a,1)
+ a ~ a - mua
+ val mub = mean(b,1)
+ b ~ b - mub
+
+ val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a);
+ val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b);
+ val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b;
+
+ sqrt(aa, aa)
+ sqrt(bb, bb)
+
+ val dd = if (Mat.hasCUDA > 0) 0.0f+(aa xTG bb) else 0.0f+(aa xT bb);
+ ab ~ 1 - ab / dd
+ ab
+ }
+}
+
+object correlationDistance {
+ class Options extends Distance.Options {
+
+ }
+}
+
diff --git a/src/main/scala/BIDMach/Experiments.scala b/src/main/scala/BIDMach/Experiments.scala
new file mode 100755
index 00000000..82e30a34
--- /dev/null
+++ b/src/main/scala/BIDMach/Experiments.scala
@@ -0,0 +1,264 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,Dict,DMat,FMat,IDict,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import scala.actors._
+import java.io._
+
+
+object Twitter {
+
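+  // Merges per-day token dictionaries into monthly and then global dictionaries for
+  // both the smiley and plain tokenized corpora, saving counts in descending order.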
+ def dodicts(threshold:Int=10, rebuild:Boolean=false):Unit = {
+ val stokdir = "/twitter/smiley/tokenized/"
+ val tokdir = "/twitter/tokenized/"
+ val dy1 = mergedicts(2011, 2013, "/disk%02d" + stokdir, "/big" + stokdir, threshold, rebuild)
+ val dy2 = mergedicts(2011, 2013, "/disk%02d" + tokdir, "/big" + tokdir, threshold, rebuild)
+ val dy = Dict.union(dy1, dy2)
+ val (sv, iv) = sortdown2(dy.counts)
+ HMat.saveBMat("/big"+tokdir+"alldict.gz", BMat(dy.cstr(iv)))
+ HMat.saveDMat("/big"+tokdir+"allwcount.gz", sv)
+ }
+
+ def mergedicts(year1:Int, year2:Int, infname:String, outfname:String, threshold:Int=10, rebuild:Boolean=false):Dict = {
+ val dd = new Array[Dict](6)
+ val md = new Array[Dict](6)
+ val yd = new Array[Dict](5)
+ var dy:Dict = null
+ var nmerged = 0
+ for (yy <- year1 to year2) {
+ for (mm <- 1 to 12) {
+ print("\n%d/%02d" format (yy, mm))
+ val ff = new File(outfname + "%04d/%02d/wcount.gz" format (yy, mm))
+ if (rebuild || ! ff.exists) {
+ var ndone = 0
+ for (id <- 1 to 31) {
+ var ielem = 372*yy + 31*mm + id
+ var idisk = ielem % 16
+ val fname = (infname + "%04d/%02d/%02d/" format (idisk, yy, mm, id))
+ val ff = new File(fname + "wcount.gz")
+ if (ff.exists) {
+ val bb = HMat.loadBMat(fname + "dict.gz")
+ val cc = HMat.loadIMat(fname + "wcount.gz")
+ dd(ndone % 6) = Dict(bb, cc, threshold)
+ ndone = ndone + 1
+ print("-")
+ if (ndone % 6 == 0) {
+ md(ndone / 6 - 1) = Dict.union(dd:_*)
+ print("+")
+ }
+ }
+ }
+ if (ndone % 6 != 0) {
+ md(ndone / 6) = Dict.union(dd.slice(0, ndone % 6):_*)
+ print("+")
+ }
+ if (ndone > 0) {
+ val dx = Dict.union(md.slice(0, (ndone-1)/6+1):_*)
+ val (sv, iv) = sortdown2(dx.counts)
+ val dxx = Dict(dx.cstr(iv), sv)
+ HMat.saveBMat(outfname + "%04d/%02d/dict.gz" format (yy, mm), BMat(dxx.cstr))
+ HMat.saveDMat(outfname + "%04d/%02d/wcount.gz" format (yy, mm), dxx.counts)
+ }
+// println("")
+ }
+ val f2 = new File(outfname + "%04d/%02d/wcount.gz" format (yy, mm))
+ if (f2.exists) {
+ val bb = HMat.loadBMat(outfname + "%04d/%02d/dict.gz" format (yy, mm))
+ val cc = HMat.loadDMat(outfname + "%04d/%02d/wcount.gz" format (yy, mm))
+ yd(nmerged % 5) = Dict(bb, cc, 4*threshold)
+ nmerged += 1
+ print("*")
+ if (nmerged % 5 == 0) {
+ val dm = Dict.union(yd:_*)
+ if (nmerged == 5) {
+ dy = dm
+ } else {
+ dy = Dict.union(dy, dm)
+ }
+ }
+ }
+ }
+ }
+ if (nmerged % 5 != 0) {
+ val dm = Dict.union(yd.slice(0, nmerged % 5):_*)
+ dy = Dict.union(dy, dm)
+ }
+ println
+ val (sv, iv) = sortdown2(dy.counts)
+ val dyy = Dict(dy.cstr(iv), sv)
+ HMat.saveBMat(outfname + "dict.gz", BMat(dyy.cstr))
+ HMat.saveDMat(outfname + "wcount.gz", dyy.counts)
+ dyy
+ }
+
+ def getDict = {
+ val bd = loadBMat("/big/twitter/tokenized/alldict.gz")
+ val bc = loadDMat("/big/twitter/tokenized/allwcount.gz")
+ Dict(bd, bc)
+ }
+
+ def getBiDict = {
+ val bd = loadIMat("/big/twitter/tokenized/allbdict.lz4")
+ val bc = loadDMat("/big/twitter/tokenized/allbcnts.lz4")
+ IDict(bd, bc)
+ }
+
+ def getTriDict = {
+ val bd = loadIMat("/big/twitter/tokenized/alltdict.lz4")
+ val bc = loadDMat("/big/twitter/tokenized/alltcnts.lz4")
+ IDict(bd, bc)
+ }
+
+ def junk:CSMat = {
+ csrow("", "", "", "", "", "", "",
+ "", "", "", "", "", "", "",
+ "", "", "", "", "", "" +
+ "", "", "", "", "", "", "", "",
+ "", "", "", "",
+ "", "", "", "", "", "",
+ "", "", "", "", "", "", "", "", "", "",
+ "", "", "", "", "", "",
+ "http", "https", "apos", "kml", "amp", "www", "quot", "id", "latitude", "longitude", "latlonbox", "geo", "json")
+ }
+
+ def findEmoticons(n:Int, dd:Dict) = {
+ val smiles = csrow(":-)", ":)", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ":っ)")
+ val laughs = csrow(":-d", ":d", "8-d", "8d", "x-d", "xd", "x-x", "=-d", "=d", "=-3", "=3", "b^d")
+ val frowns = csrow(">:[", ":-(", ":(", "", ":-c", ":c", ":-<", "", ":っc", ":<", ":-[", ":[", ":{")
+ val angry = csrow(":-||", ":@", ">:(")
+ val crying = csrow(":'-(", ":'(", "qq")
+ val horror = csrow("d:<", "d:", "d8", "d;", "d=", "dx", "v.v", "d-':")
+ val surprise = csrow(">:o", ":-o", ":o", "°o°", "°o°", ":o", "o_o", "o_0", "o.o", "8-0")
+ val wink = csrow(";-)", ";)", "*-)", "*)", ";-]", ";]", ";d", ";^)", ":-,")
+ val all = List(smiles, laughs, frowns, angry, crying, horror, surprise, wink, junk)
+ val out = zeros(all.length, n)
+ for (i <- 0 until all.length) {
+ val mm = all(i)
+ var j = 0
+ while (j < mm.length) {
+ val k = dd(mm(j))
+ if (k >= 0 && k < n) out(i, k) = 1
+ j += 1
+ }
+ }
+ out
+ }
+
+ def getGramDict(nuni0:Int=50, nbi0:Int=100, ntri0:Int=200, rebuild:Boolean=false):Dict = {
+ val nuni = nuni0 * 1000
+ val nbi = nbi0 * 1000
+ val ntri = ntri0 * 1000
+ val fname = "/big/twitter/tokenized/dict_%d_%d_%d" format (nuni0, nbi0, ntri0)
+ if (!rebuild && (new File(fname + "_bmat.lz4").exists) && (new File(fname + "_dmat.lz4").exists)) {
+ val bm = loadBMat(fname + "_bmat.lz4")
+ val dm = loadDMat(fname + "_dmat.lz4")
+ Dict(bm, dm)
+ } else {
+ val ud = getDict
+ val bd = getBiDict
+ val td = getTriDict
+ val dd = IDict.gramDict(nuni, nbi, ntri, ud, bd, td)
+ saveBMat(fname + "_bmat.lz4", BMat(dd.cstr))
+ saveDMat(fname + "_dmat.lz4", dd.counts)
+ dd
+ }
+ }
+
+ def getEmoticonMap(nuni0:Int=50, nbi0:Int=100, ntri0:Int=200, rebuild:Boolean=false):FMat = {
+ val nuni = nuni0 * 1000
+ val nbi = nbi0 * 1000
+ val ntri = ntri0 * 1000
+ val fname = "/big/twitter/tokenized/dict_%d_%d_%d" format (nuni0, nbi0, ntri0)
+ if (!rebuild && (new File(fname + "_emos.lz4").exists)) {
+ loadFMat(fname + "_emos.lz4")
+ } else {
+ val ud = getDict
+ val bdt = getBiDict.grams(0->nbi,?)
+ val tdt = getTriDict.grams(0->ntri,?)
+ val em = findEmoticons(1 + maxi(irow(nuni) \ maxi(bdt) \ maxi(tdt)).v, ud)
+ val bv = zeros(em.nrows, nbi)
+ val tv = zeros(em.nrows, ntri)
+ for (i <- 0 until em.nrows) {
+ bv(i, ?) = max(em(i, bdt(?, 0)), em(i, bdt(?, 1)))
+ tv(i, ?) = max(em(i, tdt(?, 0)), max(em(i, tdt(?, 1)), em(i, tdt(?, 2))))
+ }
+ val emos = em(?, 0->nuni) \ bv(?, 0->nbi) \ tv(?, 0->ntri)
+ saveFMat(fname + "_emos.lz4", emos)
+ emos
+ }
+ }
+
+ def logisticModelPar(
+ nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0),
+ nend0:Int = FilesDataSource.encodeDate(2013,7,1,0),
+ nuni0:Int = 50,
+ nbi0:Int = 100,
+ ntri0:Int = 200
+ ) = {
+ val ds = SFilesDataSource.twitterNgramBlend(nstart0, nend0)
+// val ds = SFilesDataSource.twitterWords(nstart0, nend0)
+ ds.opts.addConstFeat = true
+ ds.opts.featType = 0
+ val gd = getGramDict(nuni0, nbi0, ntri0)
+ val em = getEmoticonMap(nuni0, nbi0, ntri0)
+ val nfeats = gd.length + 1
+ val mask = (sum(em) == 0f) \ 1
+// val targets = em(0->(em.nrows-1), ?) \ zeros(em.nrows-1,1)
+ val targets = em(0->1, ?) \ 0
+ val ntargets = targets.nrows
+ val exptsv = col(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
+ val exptst = col(0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
+// val expts = col(0.5)
+ val avalues = col(0.1f, 1f, 10f)
+ val expts1 = ones(avalues.length*ntargets, 1) ⊗ exptsv ⊗ ones(exptst.length, 1)
+ val expts2 = ones(avalues.length*exptsv.length*ntargets, 1) ⊗ exptst
+ val alphas = ones(ntargets, 1) ⊗ avalues ⊗ ones(exptst.length*exptsv.length, 1)
+ val aopts = new ADAGradUpdater.Options
+ aopts.vecExponent = expts1
+ aopts.timeExponent = expts2
+ aopts.alpha = alphas
+ aopts.mask = mask
+ val gopts = new GLMmodel.Options
+ gopts.links = iones(expts1.length, 1)
+ gopts.mask = mask
+ gopts.targmap = mkdiag(ones(ntargets, 1)) ⊗ ones(expts1.length/ntargets, 1)
+ gopts.targets = targets
+ new LearnFParModelx(ds, gopts, GLMmodel.mkGLMmodel _, aopts, GLMmodel.mkUpdater _)
+ }
+
+ def logisticModel(
+ mat:SMat,
+ ntargs:Int = 1,
+ exptsv:FMat = col(0.4, 0.5, 0.6),
+ exptst:FMat = col(0.4, 0.5, 0.6),
+ avalues:FMat = col(0.1, 0.3, 1),
+ nuni0:Int = 50,
+ nbi0:Int = 100,
+ ntri0:Int = 200
+ ) = {
+ val ds = new MatDataSource(Array(mat:Mat))
+ val gd = getGramDict(nuni0, nbi0, ntri0)
+ val em = getEmoticonMap(nuni0, nbi0, ntri0)
+ val nfeats = gd.length + 1
+ val mask = (sum(em) == 0f) \ 1
+ val targets0 = em(0->(em.nrows-1), ?) \ zeros(em.nrows-1,1)
+ val targets = targets0(0->ntargs, ?)
+ val ntargets = targets.nrows
+ val expts1 = ones(avalues.length*ntargets, 1) ⊗ exptsv ⊗ ones(exptst.length, 1)
+ val expts2 = ones(avalues.length*exptsv.length*ntargets, 1) ⊗ exptst
+ val alphas = ones(ntargets, 1) ⊗ avalues ⊗ ones(exptst.length*exptsv.length, 1)
+ val aopts = new ADAGradUpdater.Options
+ aopts.vecExponent = expts1
+ aopts.timeExponent = expts2
+ aopts.alpha = alphas
+ aopts.mask = mask
+ val gopts = new GLMmodel.Options
+ gopts.links = iones(expts1.length, 1)
+ gopts.mask = mask
+ gopts.targmap = mkdiag(ones(ntargets, 1)) ⊗ ones(expts1.length/ntargets, 1)
+ gopts.targets = targets
+ Learner(ds, new GLMmodel(gopts), null, new ADAGradUpdater(aopts))
+ }
+
+
+}
\ No newline at end of file
diff --git a/src/main/scala/BIDMach/FactorModel.scala b/src/main/scala/BIDMach/FactorModel.scala
new file mode 100755
index 00000000..f9c7df63
--- /dev/null
+++ b/src/main/scala/BIDMach/FactorModel.scala
@@ -0,0 +1,76 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class FactorModel(override val opts:FactorModel.Options) extends Model(opts) {
+
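+  // Initializes the model matrix with data-scaled random values, normalizing each
+  // row to sum to 1, and (if a user factor matrix exists) fills it with 1/dim.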
+ override def init(datasource:DataSource) = {
+ super.init(datasource)
+ val data0 = mats(0)
+ val m = size(data0, 1)
+ val d = opts.dim
+ val sdat = (sum(data0,2).t + 1.0f).asInstanceOf[FMat]
+ val sp = sdat / sum(sdat)
+ println("initial perplexity=%f" format math.exp(- (sp ddot ln(sp))) )
+
+ val modelmat = rand(d,m)
+ modelmat ~ modelmat *@ sdat
+ val msum = sum(modelmat, 2)
+ modelmat ~ modelmat / msum
+ modelmats = new Array[Mat](1)
+ modelmats(0) = if (opts.useGPU && Mat.hasCUDA > 0) GMat(modelmat) else modelmat
+ datasource.reset
+
+ if (mats.size > 1) {
+ while (datasource.hasNext) {
+ mats = datasource.next
+ val dmat = mats(1)
+ dmat.set(1.0f/d)
+ datasource.putBack(mats,1)
+ }
+ }
+ }
+
+ def reuseuser(a:Mat):Mat = {
+ val out = a match {
+ case aa:SMat => FMat.newOrCheckFMat(opts.dim, a.ncols, null, a.GUID, "reuseuser".##)
+ case aa:GSMat => GMat.newOrCheckGMat(opts.dim, a.ncols, null, a.GUID, "reuseuser".##)
+ }
+ out.set(1f)
+ out
+ }
+
+ def uupdate(data:Mat, user:Mat)
+
+ def mupdate(data:Mat, user:Mat)
+
+ def mupdate2(data:Mat, user:Mat) = {}
+
+ def evalfun(data:Mat, user:Mat):FMat
+
+ def doblock(gmats:Array[Mat], i:Long) = {
+ val sdata = gmats(0)
+ val user = if (gmats.length > 1) gmats(1) else reuseuser(gmats(0))
+ uupdate(sdata, user)
+ mupdate(sdata, user)
+ }
+
+ def evalblock(mats:Array[Mat]):FMat = {
+ val sdata = gmats(0)
+ val user = if (gmats.length > 1) gmats(1) else reuseuser(gmats(0))
+ uupdate(sdata, user)
+ evalfun(sdata, user)
+ }
+}
+
+object FactorModel {
+ class Options extends Model.Options {
+ var uiter = 8
+ var weps = 1e-10f
+ var minuser = 1e-8f
+ }
+}
+
+
diff --git a/src/main/scala/BIDMach/Featurizer.scala b/src/main/scala/BIDMach/Featurizer.scala
new file mode 100755
index 00000000..f5946d76
--- /dev/null
+++ b/src/main/scala/BIDMach/Featurizer.scala
@@ -0,0 +1,627 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,Dict,DMat,FMat,GMat,GIMat,GSMat,HMat,IDict,IMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import scala.actors._
+import scala.annotation.switch
+import java.io._
+
+class Featurizer(val opts:Featurizer.Options = new Featurizer.Options) {
+
+ var alldict:Dict = null
+ var allbdict:IDict = null
+ var alltdict:IDict = null
+
+ def mergeDicts(rebuild:Int,dictname:String="dict.gz",wcountname:String="wcount.gz"):Dict = {
+ val dd = new Array[Dict](5) // Big enough to hold log2(days per month)
+ val nmonths = 2 + (opts.nend - opts.nstart)/31
+ val md = new Array[Dict](1+(math.log(nmonths)/math.log(2)).toInt) // Big enough to hold log2(num months)
+ println("Building monthly dicts for "+opts.thisDir)
+ for (d <- opts.nstart to opts.nend) { // Conditional on rebuild, merge the dictionaries for each month
+ val (year, month, day) = Featurizer.decodeDate(d)
+ val fm = new File(opts.fromMonthDir(d) + wcountname)
+ if (rebuild > 1 || ! fm.exists) {
+ val fd = new File(opts.fromDayDir(d) + wcountname)
+ if (fd.exists) {
+ val bb = loadBMat(opts.fromDayDir(d) + dictname)
+ val cc = loadIMat(opts.fromDayDir(d) + wcountname)
+ Dict.treeAdd(Dict(bb, cc, opts.threshold), dd)
+ print(".")
+ }
+ if (day == 31) {
+ val dx = Dict.treeFlush(dd)
+ if (dx != null) {
+ val (sv, iv) = sortdown2(dx.counts)
+ val dxx = Dict(dx.cstr(iv), sv)
+ val fd = new File(opts.fromMonthDir(d))
+ if (!fd.exists) fd.mkdirs
+ saveBMat(opts.fromMonthDir(d)+dictname, BMat(dxx.cstr))
+ saveDMat(opts.fromMonthDir(d)+wcountname, dxx.counts)
+ println("%04d-%02d" format (year,month))
+ }
+ }
+ }
+ }
+ if (rebuild > 0) {
+ println("Merging monthly dicts for "+opts.thisDir)
+ for (d <- opts.nstart to opts.nend) { // Conditionally merge all monthly dictionaries
+ val (year, month, day) = Featurizer.decodeDate(d)
+ if (day == 31) {
+ val fm = new File(opts.fromMonthDir(d) + wcountname)
+ if (fm.exists) {
+ val bb = loadBMat(opts.fromMonthDir(d) + dictname)
+ val cc = loadDMat(opts.fromMonthDir(d) + wcountname)
+ Dict.treeAdd(Dict(bb, cc, 4*opts.threshold), md)
+ println("%04d-%02d" format (year,month))
+ }
+ }
+ }
+ println
+ val dy = Dict.treeFlush(md) // Get merged dictionary, sort by counts descending
+ val (sv, iv) = sortdown2(dy.counts)
+ val dyy = Dict(dy.cstr(iv), sv)
+ saveBMat(opts.thisDir + dictname, BMat(dyy.cstr))
+ saveDMat(opts.thisDir + wcountname, dyy.counts)
+ dyy
+ } else {
+ Dict(loadBMat(opts.thisDir + dictname), loadDMat(opts.thisDir + wcountname))
+ }
+ }
+
+ def mergeIDicts(rebuild:Int = 0, dictname:String="bdict.lz4", wcountname:String="bcnts.lz4", mapit:Boolean=true):IDict = {
+ println("Building monthly IDicts for " + opts.thisDir + " " + dictname)
+ if (alldict == null) alldict = Dict(loadBMat(opts.mainDict))
+ val dd = new Array[IDict](5) // Big enough to hold log2(days per month)
+ val nmonths = 2 + (opts.nend - opts.nstart)/31
+ val md = new Array[IDict](1+(math.log(nmonths)/math.log(2)).toInt) // Big enough to hold log2(num months)
+ var dy:IDict = null
+ var mdict:Dict = null
+ var domonth:Boolean = false
+ var lastmonth = 0
+ for (d <- opts.nstart to opts.nend) {
+ val (year, month, day) = Featurizer.decodeDate(d)
+ if (month != lastmonth) {
+ val dfname = opts.fromMonthDir(d) + opts.localDict
+ if (fileExists(dfname)) {
+ mdict = Dict(loadBMat(dfname)) // Load token dictionary for this month
+ val fm = new File(opts.fromMonthDir(d) + wcountname) // Did we process this month?
+ domonth = rebuild > 1 || !fm.exists
+ } else {
+ mdict = null
+ domonth = false
+ }
+ lastmonth = month
+ }
+ if (domonth) {
+ val fd = new File(opts.fromDayDir(d) + wcountname)
+ if (fd.exists) {
+ val bb = loadIMat(opts.fromDayDir(d) + dictname) // Load IDict info for this day
+ val cc = loadDMat(opts.fromDayDir(d) + wcountname)
+
+// Kludge to deal with (old) scanner problem
+ val ig = find(maxi(bb, 2) < 0x7fffffff)
+ val bb2 = bb(ig, ?)
+ val bm = if (mapit) {
+ val dict = Dict(loadBMat(opts.fromDayDir(d) + opts.localDict)) // Load token dictionary for this day
+ val map = dict --> mdict // Map from this days tokens to month dictionary
+ map(bb2) // Map the ngrams
+ } else {
+ bb2
+ }
+ val cc2 = cc(ig,0)
+// Done kludge
+ val igood = find(mini(bm, 2) >= 0) // Find the good ones
+ val bg = bm(igood,?)
+ val cg = cc2(igood)
+ val ip = icol(0->igood.length)
+ sortlexInds(bg, ip) // lex sort them
+ IDict.treeAdd(IDict(bg, cg(ip), opts.threshold), dd) // accumulate them
+ print(".")
+ }
+ if (day == 31) { // On the last day, save the accumulated results
+ val dx = IDict.treeFlush(dd)
+ if (dx != null) {
+ saveIMat(opts.fromMonthDir(d)+dictname, dx.grams)
+ saveDMat(opts.fromMonthDir(d)+wcountname, dx.counts)
+ }
+ println("%04d-%02d" format (year,month))
+ }
+ }
+ }
+ if (rebuild > 0) {
+ println("Merging monthly IDicts for " + opts.thisDir)
+ for (d <- opts.nstart to opts.nend) {
+ val (year, month, day) = Featurizer.decodeDate(d)
+ if (day == 31) { // Conditionally accumulate monthly dicts
+ val dfname = opts.fromMonthDir(d) + opts.localDict
+ if (fileExists(dfname) || ! mapit) {
+ mdict = if (mapit) Dict(loadBMat(dfname)) else null
+ val fm = new File(opts.fromMonthDir(d) + wcountname)
+ if (fm.exists) {
+ val bb = HMat.loadIMat(opts.fromMonthDir(d) + dictname) // Load the IDict data for this month
+ val cc = HMat.loadDMat(opts.fromMonthDir(d) + wcountname)
+ val bm = if (mapit) {
+ val map = mdict --> alldict
+ map(bb) // Map to global token dictionary
+ } else bb
+ val igood = find(mini(bm, 2) >= 0) // Save the good stuff
+ val bg = bm(igood,?)
+ val cg = cc(igood)
+ val ip = icol(0->igood.length)
+ sortlexInds(bg, ip)
+ IDict.treeAdd(IDict(bg, cg(ip), 4*opts.threshold), md)
+ println("%04d-%02d" format (year,month))
+ }
+ }
+ }
+ }
+ dy = IDict.treeFlush(md) // Final dictionary for the time period
+ println
+ val (sv, iv) = sortdown2(dy.counts) // Sort down by ngram frequency
+ val dyy = IDict(dy.grams(iv,?), sv)
+ saveIMat(opts.thisDir + dictname, dyy.grams)
+ saveDMat(opts.thisDir + wcountname, dyy.counts)
+ dy // Return the lex-sorted dictionary
+ } else {
+ val gyy = loadIMat(opts.thisDir + dictname)
+ val cyy = loadDMat(opts.thisDir + wcountname)
+ val iperm = icol(0->cyy.length)
+ sortlexInds(gyy, iperm)
+ IDict(gyy, cyy(iperm))
+ }
+ }
+
+
+ def mkIDicts(rebuild:Int, scanner:Scanner=TwitterScanner) = { // Build ngram dictionaries for each day
+ val nthreads = math.min(opts.nthreads, math.max(1, Mat.hasCUDA))
+ println("Building daily IDicts")
+ val done = izeros(nthreads,1)
+ for (ithread <- 0 until nthreads) {
+ Actor.actor {
+ if (Mat.hasCUDA > 0) setGPU(ithread+Mat.hasCUDA-nthreads)
+ val bigramsx = IMat(opts.guessSize, 3) // Temp storage for grams
+ val trigramsx = IMat(opts.guessSize, 4)
+ val useridsx = IMat(opts.guessSize/10, 2)
+ val bdicts = new Array[IDict](5) // Trees to hold partial merges
+ val tdicts = new Array[IDict](5)
+ val udicts = new Array[IDict](5)
+
+ for (d <- (opts.nstart+ithread) to opts.nend by nthreads) {
+ val (year, month, day) = Featurizer.decodeDate(d)
+ val fname = opts.fromDayDir(d)+opts.localDict
+ val fnew = opts.fromDayDir(d)+opts.usrCnts // Check if the userid dictionary was built yet
+ if (fileExists(fname) && (rebuild > 1 || !fileExists(fnew))) {
+ val dict = Dict(loadBMat(fname)) // load token dictionary for this day
+ for (ifile <- 0 until 24) {
+ val fn = opts.fromDayDir(d)+opts.fromFile(ifile)
+ if (fileExists(fn)) {
+ val idata = loadIMat(fn)
+ val (nuni, nbi, ntri, nusers) = scanner.scan(opts, dict, idata, null, bigramsx, trigramsx, useridsx)
+ val bigrams = bigramsx(0->nbi, 0->2)
+ val bid = if (nbi > 0) IDict.dictFromData(bigrams) else null
+ val trigrams = trigramsx(0->ntri, 0->3)
+ val trid = if (ntri > 0) IDict.dictFromData(trigrams) else null
+ val userids = useridsx(0->nusers, 0)
+ val uid = if (nusers > 0) IDict.dictFromData(userids) else null
+ IDict.treeAdd(bid, bdicts)
+ IDict.treeAdd(trid, tdicts)
+ IDict.treeAdd(uid, udicts)
+ }
+ }
+ val bf = IDict.treeFlush(bdicts)
+ val tf = IDict.treeFlush(tdicts)
+ val uf = IDict.treeFlush(udicts)
+ saveIMat(opts.fromDayDir(d) + opts.biDict, bf.grams)
+ saveDMat(opts.fromDayDir(d) + opts.biCnts, bf.counts)
+ saveIMat(opts.fromDayDir(d) + opts.triDict, tf.grams)
+ saveDMat(opts.fromDayDir(d) + opts.triCnts, tf.counts)
+ saveIMat(opts.fromDayDir(d) + opts.usrDict, uf.grams)
+ saveDMat(opts.fromDayDir(d) + opts.usrCnts, uf.counts)
+ print(".")
+ }
+ if (ithread == 0 && day/nthreads == 31/nthreads) println("%04d-%02d" format (year,month))
+ }
+ done(ithread,0) = 1
+ }
+ }
+ while (mini(done).v == 0) Thread.`yield`
+ }
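+
+  /* treeAdd/treeFlush implement a binary-counter style merge: each new daily
+   * dictionary goes into slot 0 of the working array and full slots carry
+   * upward, so k merges cost O(k log k) element comparisons rather than the
+   * O(k^2) of repeated pairwise unions. That is why the working arrays here
+   * and in mergeIDicts are sized to roughly log2 of the number of inputs. */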
+
+ def mkUniFeats(map:IMat, gramsx:IMat, ng:Int):IMat = {
+ val unis = map(gramsx(0->ng, 0))
+ val igood = find(unis >= 0)
+ val gg = unis(igood, 0)
+ val ggn = gramsx(igood, 1)
+ val feats = ggn \ gg
+ sortlex(feats)
+ val (outr, ix, iy) = uniquerows(feats)
+ val fcounts = (ix(1->ix.length, 0) on iy.length) - ix
+ outr \ fcounts
+ }
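+
+  /* Both feature builders return the unique, lexically sorted (status, id)
+   * rows with a count column appended. The count line relies on uniquerows
+   * returning the first-occurrence offset ix of each unique row plus the row
+   * map iy (as used here): appending the total row count and differencing
+   * turns offsets into frequencies, e.g. sorted rows [a;a;b] give ix=[0;2],
+   * so counts = [2;3] - [0;2] = [2;1]. */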
+
+ def mkGramFeats(map:IMat, gramsx:IMat, ng:Int, alldict:IDict):IMat = {
+ val grams = map(gramsx(0->ng, 0->(gramsx.ncols-1)))
+ val igood = find(mini(grams, 2) >= 0)
+ val gg = grams(igood,?)
+ val ggn = gramsx(igood, gramsx.ncols-1)
+ val gmap = IDict(gg) --> alldict
+ val igood2 = find(gmap >= 0)
+ val feats = ggn(igood2,0) \ gmap(igood2,0)
+ sortlex(feats)
+ val (outr, ix, iy) = uniquerows(feats)
+ val fcounts = (ix(1->ix.length, 0) on iy.length) - ix
+ outr \ fcounts
+ }
+
+ def featurize(rebuild:Int, scanner:Scanner=TwitterScanner) = {
+ println("Featurizing in " + opts.thisDir)
+ if (alldict == null) alldict = Dict(HMat.loadBMat(opts.mainDict))
+ if (allbdict == null) allbdict = IDict(HMat.loadIMat(opts.mainBDict))
+ if (alltdict == null) alltdict = IDict(HMat.loadIMat(opts.mainTDict))
+ alldict.makeHash
+ allbdict.makeSorted
+ alltdict.makeSorted
+ val nthreads = math.min(opts.nthreads, math.max(1, Mat.hasCUDA))
+ val done = izeros(nthreads,1)
+ for (ithread <- 0 until nthreads) {
+ Actor.actor {
+ if (Mat.hasCUDA > 0) setGPU(ithread+Mat.hasCUDA-nthreads)
+ val unigramsx = IMat(opts.guessSize, 2)
+ val bigramsx = IMat(opts.guessSize, 3)
+ val trigramsx = IMat(opts.guessSize, 4)
+ val userids = IMat(opts.guessSize/10, 2)
+ for (d <- (opts.nstart+ithread) to opts.nend by nthreads) {
+ val (year, month, day) = Featurizer.decodeDate(d)
+ val fdict = opts.fromDayDir(d)+opts.localDict
+ if (fileExists(fdict)) {
+ var dict:Dict = null
+ var map:IMat = null
+ val fd = new File(opts.toDayDir(d))
+ if (!fd.exists) fd.mkdirs
+ for (ifile <- 0 until 24) {
+ val fn = opts.fromDayDir(d)+opts.fromFile(ifile)
+ val fx = opts.toDayDir(d)+opts.toTriFeats(ifile)
+ if (fileExists(fn) && (rebuild > 0 || !fileExists(fx))) {
+ if (dict == null) {
+ dict = Dict(loadBMat(fdict))
+ map = dict --> alldict
+ }
+ val idata = loadIMat(fn)
+ val (nuni, nbi, ntri, nstatuses) = scanner.scan(opts, dict, idata, unigramsx, bigramsx, trigramsx, userids)
+ val unifeats = mkUniFeats(map, unigramsx, nuni)
+ val bifeats = mkGramFeats(map, bigramsx, nbi, allbdict)
+ val trifeats = mkGramFeats(map, trigramsx, ntri, alltdict)
+ saveIMat(opts.toDayDir(d) + opts.toUniFeats(ifile), unifeats)
+ saveIMat(opts.toDayDir(d) + opts.toBiFeats(ifile), bifeats)
+ saveIMat(opts.toDayDir(d) + opts.toTriFeats(ifile), trifeats)
+ saveIMat(opts.toDayDir(d) + opts.toUserids(ifile), userids(0->nstatuses, ?))
+ if (ifile == 23) print(".")
+ }
+ }
+ }
+ if (ithread == 0 && day/nthreads == 31/nthreads) println("%04d-%02d" format (year,month))
+ }
+ done(ithread,0) = 1
+ }
+ }
+ while (mini(done).v == 0) Thread.`yield`
+ }
+
+ def fileExists(fname:String) = {
+ val testme = new File(fname)
+ testme.exists
+ }
+
+ def loadDicts() = {
+ if (alldict == null) alldict = Dict(HMat.loadBMat(opts.mainDict))
+ if (allbdict == null) allbdict = IDict(HMat.loadIMat(opts.mainBDict))
+ if (alltdict == null) alltdict = IDict(HMat.loadIMat(opts.mainTDict))
+ val alld = alldict.cstr
+ val bg = allbdict.grams
+ val tg = alltdict.grams
+ val bd = CSMat(bg.nrows,1)
+ val td = CSMat(tg.nrows,1)
+ var i = 0
+ while (i < bg.nrows) {
+ bd(i) = alld(bg(i,0)) + " " + alld(bg(i,1))
+ i += 1
+ }
+ i = 0
+ while (i < tg.nrows) {
+ td(i) = (alld(tg(i,0)) + " " + alld(tg(i,1))) + (" " + alld(tg(i,2)))
+ i += 1
+ }
+ (alld, bd, td)
+ }
+}
+
+object Featurizer {
+
+ def alloptions = {
+ val ff = new Featurizer
+ val newopts = new Featurizer.Options{
+ override val tokDirName = "twitter/smiley/tokenized/"
+ override val featDirName = "twitter/smiley/featurized/"
+ }
+ val fs = new Featurizer(newopts)
+ (ff,fs)
+ }
+
+  /*
+   * Rebuild levels:
+   * 0: Incrementally build monthly Dicts and IDicts and featurize any new files. Don't rebuild dictionaries.
+   * 1: Rebuild all dictionaries from the monthlies, and rebuild all features.
+   * 2: Rebuild everything.
+   */
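+
+  // Hypothetical driver session for the levels above (assumes the directory
+  // layout configured in Options already exists):
+  //   Featurizer.buildAll(0)   // incremental: only new files are processed
+  //   Featurizer.buildAll(1)   // re-merge dictionaries, rebuild all features
+  //   Featurizer.buildAll(2)   // additionally rebuild the per-day dictionaries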
+
+ def updateDicts(rebuild:Int=0) = {
+ val (ff,fs) = alloptions
+ ff.mergeDicts(rebuild)
+ fs.mergeDicts(rebuild)
+ ff.mkIDicts(rebuild)
+ fs.mkIDicts(rebuild)
+ }
+
+ def buildAll(rebuild:Int=0) = {
+ buildMainDict(rebuild)
+ buildMainGDicts(rebuild)
+ buildFeatures(rebuild)
+ }
+
+ def buildMainDict(rebuild:Int) = {
+ val (ff,fs) = alloptions
+ val d1 = ff.mergeDicts(rebuild)
+ val d2 = fs.mergeDicts(rebuild)
+ if (rebuild>0) {
+ val dd = Dict.union(d1, d2)
+ val (sc, ic) = sortdown2(dd.counts)
+ saveBMat(ff.opts.mainDict, BMat(dd.cstr(ic,0)))
+ saveDMat(ff.opts.mainCounts, sc)
+ }
+ }
+
+ def buildMainGDicts(rebuild:Int) = {
+ val (ff, fs) = alloptions
+
+ val bd1 = ff.mergeIDicts(rebuild)
+ val bd2 = fs.mergeIDicts(rebuild)
+ if (rebuild>0) {
+ val bdd = IDict.merge2(bd1,bd2)
+ val (sbc, ibc) = sortdown2(bdd.counts)
+ saveIMat(ff.opts.mainBDict, IMat(bdd.grams(ibc,?)))
+ saveDMat(ff.opts.mainBCounts, sbc)
+ }
+
+ val td1 = ff.mergeIDicts(rebuild, "tdict.lz4", "tcnts.lz4")
+ val td2 = fs.mergeIDicts(rebuild, "tdict.lz4", "tcnts.lz4")
+ if (rebuild>0) {
+ val tdd = IDict.merge2(td1,td2)
+ val (stc, itc) = sortdown2(tdd.counts)
+ saveIMat(ff.opts.mainTDict, IMat(tdd.grams(itc,?)))
+ saveDMat(ff.opts.mainTCounts, stc)
+ }
+
+ ff.opts.threshold = 1
+ fs.opts.threshold = 1
+ val usr1 = ff.mergeIDicts(rebuild, "usrdict.lz4", "usrcnts.lz4", false)
+ val usr2 = fs.mergeIDicts(rebuild, "usrdict.lz4", "usrcnts.lz4", false)
+ if (rebuild>0) {
+ val usr = IDict.merge2(usr1,usr2)
+ val (usrs, usrc) = sortdown2(usr.counts)
+ saveIMat(ff.opts.mainUsrDict, IMat(usr.grams(usrc,?)))
+ saveDMat(ff.opts.mainUsrCounts, usrs)
+ }
+ }
+
+ def buildFeatures(rebuild:Int) = {
+ val (ff, fs) = alloptions
+ fs.featurize(rebuild)
+ ff.featurize(rebuild)
+ }
+
+ def encodeDate(yy:Int, mm:Int, dd:Int) = (372*yy + 31*mm + dd)
+
+ def decodeDate(n:Int):(Int, Int, Int) = {
+ val yy = (n - 32) / 372
+ val days = n - 32 - 372 * yy
+ val mm = days / 31 + 1
+ val dd = days - 31 * (mm - 1) + 1
+ (yy, mm, dd)
+ }
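+
+  // The codec gives every month 31 slots and every year 372, so codes are
+  // monotone in (yy, mm, dd) but not dense. Worked example:
+  //   encodeDate(2012,3,1) = 372*2012 + 31*3 + 1 = 748558
+  //   decodeDate(748558):   yy = 748526/372 = 2012, days = 62,
+  //                         mm = 62/31 + 1 = 3, dd = 62 - 62 + 1 = 1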
+
+ def dirxMap(fname:String):(Int)=>String = {
+ (n:Int) => {
+ val (yy, mm, dd) = decodeDate(n)
+ (fname format (n % 16, yy, mm, dd))
+ }
+ }
+
+ def dirMap(fname:String):(Int)=>String = {
+ (n:Int) => {
+ val (yy, mm, dd) = decodeDate(n)
+ (fname format (yy, mm, dd))
+ }
+ }
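+
+  // dirxMap additionally shards by n % 16, spreading consecutive days over 16
+  // disks. With the default tokDirName, for instance,
+  //   fromDayDir(encodeDate(2012,3,1))   // n = 748558, and 748558 % 16 = 14
+  // yields "/disk14/twitter/tokenized/2012/03/01/".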
+
+
+ class Options {
+ val tokDirName = "twitter/tokenized/"
+ val featDirName = "twitter/featurized/"
+ val localDict:String = "dict.gz"
+ val localCount:String = "wcount.gz"
+ val biDict:String = "bdict.lz4"
+ val triDict:String = "tdict.lz4"
+ val usrDict:String = "usrdict.lz4"
+ val biCnts:String = "bcnts.lz4"
+ val triCnts:String = "tcnts.lz4"
+ val usrCnts:String = "usrcnts.lz4"
+ def thisDir = "/big/" + tokDirName
+ def mainDir = "/big/twitter/tokenized/"
+ def mainDict:String = mainDir + "all" + localDict
+ def mainCounts:String = mainDir + "all" + localCount
+ def mainBDict:String = mainDir + "all" + biDict
+ def mainBCounts:String = mainDir + "all" + biCnts
+ def mainTDict:String = mainDir + "all" + triDict
+ def mainTCounts:String = mainDir + "all" + triCnts
+ def mainUsrDict:String = mainDir + "all" + usrDict
+ def mainUsrCounts:String = mainDir + "all" + usrCnts
+ def fromYearDir:(Int)=>String = dirMap(thisDir + "%04d/")
+ def fromMonthDir:(Int)=>String = dirMap(thisDir + "%04d/%02d/")
+ def fromDayDir:(Int)=>String = dirxMap("/disk%02d/" + tokDirName + "%04d/%02d/%02d/")
+ def toDayDir:(Int)=>String = dirxMap("/disk%02d/" + featDirName + "%04d/%02d/%02d/")
+ var fromFile:(Int)=>String = (n:Int) => ("tweet%02d.gz" format n)
+ var toUniFeats:(Int)=>String = (n:Int) => ("unifeats%02d.lz4" format n)
+ var toBiFeats:(Int)=>String = (n:Int) => ("bifeats%02d.lz4" format n)
+ var toTriFeats:(Int)=>String = (n:Int) => ("trifeats%02d.lz4" format n)
+ var toUserids:(Int)=>String = (n:Int) => ("userids%02d.lz4" format n)
+ var nstart:Int = encodeDate(2011,11,22)
+ var nend:Int = encodeDate(2013,6,31)
+ var threshold = 10
+ var guessSize = 200000000
+ var nthreads = 1
+ }
+}
+
+trait Scanner {
+ def scan(opts:Featurizer.Options, dict:Dict, idata:IMat, unigramsx:IMat, bigramsx:IMat, trigramsx:IMat, userids:IMat):(Int, Int, Int, Int)
+}
+
+object TwitterScanner extends Scanner {
+ final val OutsideStatus = 0
+ final val InsideStatus = 1
+ final val InsideUser = 2
+ final val InsideUserId = 3
+ final val InsideText = 4
+ final val InsideRetweet = 5
+ final val InsideStatusL2 = 6
+ final val InsideUserL2 = 7
+ final val InsideUserIdL2 = 8
+ final val InsideTextL2 = 9
+
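+  /* scan is a hand-rolled DFA over the tokenized XML stream,
+   *   <status> ... <user><id>N</id></user> ... <text>w1 w2 ...</text> ... </status>
+   * with the *L2 states tracking the same fields inside an embedded retweet.
+   * Negative raw codes carry numeric literals, recovered as -(tok+1). */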
+ def scan(opts:Featurizer.Options, dict:Dict, idata:IMat, unigramsx:IMat, bigramsx:IMat, trigramsx:IMat, userids:IMat):(Int, Int, Int, Int) = {
+
+    // Codes of the XML-ish tokens that delimit tweet fields in the input
+    val Isstart = dict("<status>")
+    val Isend = dict("</status>")
+    val Irstart = dict("<retweet>")
+    val Irend = dict("</retweet>")
+    val Itstart = dict("<text>")
+    val Itend = dict("</text>")
+    val Iuser = dict("<user>")
+    val Iuend = dict("</user>")
+    val Iistart = dict("<id>")
+    val Iiend = dict("</id>")
+ var state = 0
+
+ var istatus = -1
+ var nuni = 0
+ var nbi = 0
+ var ntri = 0
+ var len = idata.length
+ var i = 0
+ while (i < len) {
+ val tok = idata.data(i)-1
+// if (tok+1 >0) println(dict(tok)+ " " + state)
+// else println("num " +(-(tok+1))+ " " + state)
+ if (tok == Isend) {
+ state = OutsideStatus
+ } else {
+ (state: @switch) match {
+ case OutsideStatus =>
+ if (tok == Isstart) {
+ state = InsideStatus
+ istatus += 1
+ }
+ case InsideStatus =>
+ tok match {
+ case Iuser => state = InsideUser
+ case Itstart => state = InsideText
+ case Irstart => state = InsideRetweet
+ case _ => {}
+ }
+ case InsideUser =>
+ tok match {
+ case Iistart => state = InsideUserId
+ case Irstart => state = InsideRetweet
+ case Iuend => state = InsideStatus
+ case _ => {}
+ }
+ case InsideUserId =>
+ if (tok == Iiend) {
+ state = InsideUser
+ } else if (tok+1 < 0) {
+ if (userids != null) {
+ userids(istatus,0) = -(tok+1)
+ userids(istatus,1) = 0
+ }
+ }
+ case InsideText =>
+ tok match {
+ case Iuser => state = InsideUser
+ case Itend => state = InsideStatus
+ case _ => if (tok+1 > 0) {
+ if (unigramsx != null) {
+ unigramsx(nuni, 0) = tok
+ unigramsx(nuni, 1) = istatus
+ nuni += 1
+ }
+ if (idata.data(i-1) > 0) {
+ val tok1 = idata.data(i-1)-1
+ if (tok1 != Itstart) {
+ bigramsx(nbi, 0) = tok1
+ bigramsx(nbi, 1) = tok
+ bigramsx(nbi, 2) = istatus
+ nbi += 1
+ if (idata.data(i-2) > 0) {
+ val tok2 = idata.data(i-2)-1
+ if (tok2 != Itstart) {
+ trigramsx(ntri, 0) = tok2
+ trigramsx(ntri, 1) = tok1
+ trigramsx(ntri, 2) = tok
+ trigramsx(ntri, 3) = istatus
+ ntri += 1
+ }
+ }
+ }
+ }
+ }
+ }
+ case InsideRetweet =>
+ tok match {
+ case Isstart => state = InsideStatusL2
+ case Irend => state = InsideStatus
+ case _ => {}
+ }
+ case InsideStatusL2 =>
+ tok match {
+ case Iuser => state = InsideUserL2
+ case Itstart => state = InsideTextL2
+ case _ => {}
+ }
+ case InsideUserL2 =>
+ tok match {
+ case Iistart => state = InsideUserIdL2
+ case Iuend => state = InsideStatusL2
+ case _ => {}
+ }
+          case InsideUserIdL2 =>
+            tok match {
+              case Iiend => state = InsideUserL2
+              case _ => if (tok+1 < 0) { // negative raw code = numeric id, as in InsideUserId
+                if (userids != null) userids(istatus, 1) = -(tok+1)
+              }
+            }
+ case InsideTextL2 =>
+ tok match {
+ case Itend => state = InsideStatusL2
+ case Iuser => state = InsideUserL2
+ case _ => {}
+ }
+ case _ => {}
+ }
+
+ }
+ i += 1
+ }
+ (nuni, nbi, ntri, istatus)
+ }
+}
\ No newline at end of file
diff --git a/src/main/scala/BIDMach/GLMmodel.scala b/src/main/scala/BIDMach/GLMmodel.scala
new file mode 100755
index 00000000..65a3a63c
--- /dev/null
+++ b/src/main/scala/BIDMach/GLMmodel.scala
@@ -0,0 +1,195 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import edu.berkeley.bid.CUMAT
+
+
+class GLMmodel(opts:GLMmodel.Options) extends RegressionModel(opts) {
+
+ var mylinks:Mat = null
+
+ val linkArray = Array[GLMlink](LinearLink, LogisticLink)
+
+ var totflops = 0L
+
+ override def init(datasource:DataSource) = {
+ super.init(datasource)
+ mylinks = if (useGPU) GIMat(opts.links) else opts.links
+ modelmats(0) ~ modelmats(0) ∘ mask
+ totflops = 0L
+ for (i <- 0 until opts.links.length) {
+ totflops += linkArray(opts.links(i)).fnflops
+ }
+ }
+
+ def mupdate(in:Mat):FMat = {
+// println("model %f" format (mean(mean(modelmats(0)))).dv)
+ val targs = targets * in
+ min(targs, 1f, targs)
+ val alltargs = targmap * targs
+ val eta = modelmats(0) * in
+ applymeans(eta, mylinks, eta)
+// println("pred %f" format (mean(mean(pred))).dv)
+// println("%s %s %s %s %s" format (modelmats(0).mytype, updatemats(0).mytype, alltargs.mytype, pred.mytype, in.mytype))
+ val lls = llfun(eta, alltargs, mylinks)
+ alltargs ~ alltargs - eta
+ updatemats(0) ~ alltargs *^ in
+ lls
+ }
+
+ def applymeans(eta:Mat, links:Mat, out:Mat):Mat = {
+ (eta, links, out) match {
+      case (feta:FMat, ilinks:IMat, fout:FMat) => {
+        Mat.nflops += totflops * feta.ncols
+        var i = 0
+        while (i < feta.ncols) {
+          var j = 0
+          while (j < feta.nrows) {
+            val fun = linkArray(ilinks(j)).invlinkfn // per-target inverse link
+            fout.data(j + i * fout.nrows) = fun(feta.data(j + i * feta.nrows))
+            j += 1
+          }
+          i += 1
+        }
+        fout // return the filled output, not a fresh temporary
+      }
+ case (geta:GMat, gilinks:GIMat, gout:GMat) => {
+ Mat.nflops += totflops * geta.ncols
+ CUMAT.applymeans(geta.data, gilinks.data, gout.data, geta.nrows, geta.ncols)
+ out
+ }
+ }
+ }
+
+ def llfun(pred:Mat, targ:Mat, links:Mat):FMat = {
+ (pred, targ, links) match {
+ case (fpred:FMat, ftarg:FMat, ilinks:IMat) => {
+ Mat.nflops += 10L * ftarg.length
+ var i = 0
+        val out = zeros(ftarg.nrows, ftarg.ncols) // work matrix, fully overwritten below
+ while (i < ftarg.ncols) {
+ var j = 0
+ while (j < ftarg.nrows) {
+ val fun = linkArray(ilinks(j)).likelihoodfn
+ out.data(j + i * out.nrows) = fun(fpred.data(j + i * ftarg.nrows), ftarg.data(j + i * ftarg.nrows))
+ j += 1
+ }
+ i += 1
+ }
+ mean(out,2)
+ }
+ case (gpred:GMat, gtarg:GMat, gilinks:GIMat) => {
+ Mat.nflops += totflops * gpred.ncols
+        val out = gpred.zeros(gpred.nrows, gpred.ncols) // GPU scratch filled by applylls
+ CUMAT.applylls(gpred.data, gtarg.data, gilinks.data, out.data, gpred.nrows, gpred.ncols)
+ FMat(mean(out,2))
+ }
+ }
+ }
+
+}
+
+
+object LinearLink extends GLMlink {
+ def link(in:Float) = {
+ in
+ }
+
+ def invlink(in:Float) = {
+ in
+ }
+
+ def dlink(in:Float) = {
+ 1.0f
+ }
+
+ def likelihood(pred:Float, targ:Float) = {
+ val diff = targ - pred
+    -(diff * diff)
+ }
+
+ override val linkfn = link _
+
+ override val dlinkfn = dlink _
+
+ override val invlinkfn = invlink _
+
+ override val likelihoodfn = likelihood _
+
+ val fnflops = 2
+}
+
+object LogisticLink extends GLMlink {
+ def link(in:Float) = {
+ math.log(in / (1.0f - in)).toFloat
+ }
+
+ def invlink(in:Float) = {
+ if (in > 0) {
+ val tmp = math.exp(-in)
+ (1.0 / (1.0 + tmp)).toFloat
+ } else {
+ val tmp = math.exp(in)
+ (tmp / (1.0 + tmp)).toFloat
+ }
+ }
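+
+  // The two branches are the usual numerically stable sigmoid: both equal
+  // 1/(1 + exp(-in)), but each only exponentiates a non-positive argument,
+  // so neither can overflow for large |in|.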
+
+ def dlink(in:Float) = {
+ 1 / (in * (1 - in))
+ }
+
+ def likelihood(pred:Float, targ:Float) = {
+ math.log(targ * pred + (1.0f - targ) * (1.0f - pred) + 1e-20).toFloat
+ }
+
+ override val linkfn = link _
+
+ override val dlinkfn = dlink _
+
+ override val invlinkfn = invlink _
+
+ override val likelihoodfn = likelihood _
+
+ val fnflops = 20
+}
+
+object LinkEnum extends Enumeration {
+ type LinkEnum = Value
+ val Linear, Logistic = Value
+}
+
+abstract class GLMlink {
+ val linkfn:(Float => Float)
+ val dlinkfn:(Float => Float)
+ val invlinkfn:(Float => Float)
+ val likelihoodfn:((Float,Float) => Float)
+ val fnflops:Int
+}
+
+object GLMmodel {
+ class Options extends RegressionModel.Options {
+ var links:IMat = null
+ }
+
+ def mkGLMmodel(fopts:Model.Options) = {
+ new GLMmodel(fopts.asInstanceOf[GLMmodel.Options])
+ }
+
+ def mkUpdater(nopts:Updater.Options) = {
+ new ADAGradUpdater(nopts.asInstanceOf[ADAGradUpdater.Options])
+ }
+
+ def learnFParx(
+ nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+ nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+ ) = {
+ new LearnFParModelx(
+ SFilesDataSource.twitterNgramBlend(nstart, nend, 1, 0),
+ new GLMmodel.Options, mkGLMmodel _,
+ new ADAGradUpdater.Options, mkUpdater _)
+ }
+}
+
diff --git a/src/main/scala/BIDMach/LDAModel.scala b/src/main/scala/BIDMach/LDAModel.scala
new file mode 100755
index 00000000..8b445f67
--- /dev/null
+++ b/src/main/scala/BIDMach/LDAModel.scala
@@ -0,0 +1,118 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
+class LDAModel(override val opts:LDAModel.Options = new LDAModel.Options) extends FactorModel(opts) {
+ var mm:Mat = null
+ var alpha:Mat = null
+
+ var traceMem = false
+
+ override def init(datasource:DataSource) = {
+ super.init(datasource)
+ mm = modelmats(0)
+ modelmats = new Array[Mat](2)
+ modelmats(0) = mm
+ modelmats(1) = mm.ones(mm.nrows, 1)
+ updatemats = new Array[Mat](2)
+ updatemats(0) = mm.zeros(mm.nrows, mm.ncols)
+ updatemats(1) = mm.zeros(mm.nrows, 1)
+ }
+
+ def uupdate(sdata:Mat, user:Mat):Unit = {
+ if (opts.putBack < 0) user.set(1f)
+ for (i <- 0 until opts.uiter) {
+ val preds = DDS(mm, user, sdata)
+ if (traceMem) println("uupdate %d %d %d, %d %f %d" format (mm.GUID, user.GUID, sdata.GUID, preds.GUID, GPUmem._1, getGPU))
+ val dc = sdata.contents
+ val pc = preds.contents
+ max(opts.weps, pc, pc)
+ pc ~ dc / pc
+ val unew = user ∘ (mm * preds) + opts.alpha
+ if (traceMem) println("uupdate %d %d %d, %d %d %d %d %f %d" format (mm.GUID, user.GUID, sdata.GUID, preds.GUID, dc.GUID, pc.GUID, unew.GUID, GPUmem._1, getGPU))
+ if (opts.exppsi) exppsi(unew, unew)
+ user <-- unew
+ }
+// println("user %g %g" format (mini(mini(user,1),2).dv, maxi(maxi(user,1),2).dv))
+ }
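+
+  // uupdate is the factor-model e-step as a multiplicative fixed point: with
+  // topic matrix W = mm and word counts d, each round computes
+  //   user <- user ∘ (W * (d ./ (W' user))) + alpha
+  // at the nonzeros of sdata, optionally passing through exppsi as in
+  // variational LDA.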
+
+ def mupdate(sdata:Mat, user:Mat):Unit = {
+ val preds = DDS(mm, user, sdata)
+ val dc = sdata.contents
+ val pc = preds.contents
+ max(opts.weps, pc, pc)
+ pc ~ dc / pc
+ val ud = user *^ preds
+ ud ~ ud ∘ mm
+ ud ~ ud + opts.beta
+ updatemats(0) <-- ud
+ sum(ud, 2, updatemats(1))
+ if (traceMem) println("mupdate %d %d %d %d" format (sdata.GUID, user.GUID, ud.GUID, updatemats(0).GUID))
+ }
+
+ def evalfun(sdata:Mat, user:Mat):FMat = {
+ val preds = DDS(mm, user, sdata)
+ val dc = sdata.contents
+ val pc = preds.contents
+ max(opts.weps, pc, pc)
+ ln(pc, pc)
+ val sdat = sum(sdata,1)
+ val mms = sum(mm,2)
+ val suu = ln(mms ^* user)
+ if (traceMem) println("evalfun %d %d %d, %d %d %d, %d %f" format (sdata.GUID, user.GUID, preds.GUID, pc.GUID, sdat.GUID, mms.GUID, suu.GUID, GPUmem._1))
+ val vv = ((pc ddot dc) - (sdat ddot suu))/sum(sdat,2).dv
+ row(vv, math.exp(-vv))
+ }
+}
+
+object LDAModel {
+ class Options extends FactorModel.Options {
+ var LDAeps = 1e-9
+ var exppsi = true
+ var alpha = 0.001f
+ var beta = 0.0001f
+ }
+
+ def mkLDAmodel(fopts:Model.Options) = {
+ new LDAModel(fopts.asInstanceOf[LDAModel.Options])
+ }
+
+ def mkUpdater(nopts:Updater.Options) = {
+ new IncNormUpdater(nopts.asInstanceOf[IncNormUpdater.Options])
+ }
+
+ def learn(mat0:Mat) = {
+ new Learner(new MatDataSource(Array(mat0:Mat)), new LDAModel(), null, new IncNormUpdater(), new Learner.Options)
+ }
+
+ def learnBatch(mat0:Mat) = {
+ new Learner(new MatDataSource(Array(mat0:Mat)), new LDAModel(), null, new BatchNormUpdater(), new Learner.Options)
+ }
+
+ def learnFPar(
+ nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+ nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+ ) = {
+ new LearnFParModel(
+ new LDAModel.Options, mkLDAmodel _,
+ new IncNormUpdater.Options, mkUpdater _,
+ (n:Int, i:Int) => SFilesDataSource.twitterWords(nstart, nend, n, i)
+ )
+ }
+
+ def learnFParx(
+ nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+ nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+ ) = {
+ new LearnFParModelx(
+ SFilesDataSource.twitterWords(nstart, nend),
+ new LDAModel.Options, mkLDAmodel _,
+ new IncNormUpdater.Options, mkUpdater _
+ )
+ }
+}
+
+
diff --git a/src/main/scala/BIDMach/Learner.scala b/src/main/scala/BIDMach/Learner.scala
new file mode 100755
index 00000000..2eb39f17
--- /dev/null
+++ b/src/main/scala/BIDMach/Learner.scala
@@ -0,0 +1,509 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Plotting._
+import scala.collection.immutable.List
+import scala.collection.mutable.ListBuffer
+import scala.actors.Actor
+
+case class Learner(
+ val datasource:DataSource,
+ val model:Model,
+ val regularizer:Regularizer,
+ val updater:Updater,
+ val opts:Learner.Options = new Learner.Options) {
+ var results:FMat = null
+ val dopts:DataSource.Options = datasource.opts
+ val mopts:Model.Options = model.opts
+ val ropts:Regularizer.Options = if (regularizer != null) regularizer.opts else null
+ val uopts:Updater.Options = updater.opts
+
+ def setup = {
+ datasource match {
+ case ddm:MatDataSource => {
+ if (mopts.putBack >= 0) {
+ ddm.setupPutBack(mopts.putBack+1, mopts.dim)
+ }
+ }
+ case _ => {}
+ }
+ datasource.init
+ model.init(datasource)
+ updater.init(model)
+ }
+
+ def init = {
+ datasource.init
+ model.init(datasource)
+ updater.init(model)
+ }
+
+ def run() = {
+ flip
+ var done = false
+ var ipass = 0
+ var here = 0L
+ var lasti = 0
+ var bytes = 0L
+ updater.clear
+ val reslist = new ListBuffer[FMat]
+ val samplist = new ListBuffer[Float]
+ while (ipass < opts.npasses && ! done) {
+ var lastp = 0f
+ datasource.reset
+ var istep = 0
+ println("i=%2d" format ipass)
+ while (datasource.hasNext) {
+ val mats = datasource.next
+ here += datasource.opts.blockSize
+ bytes += 12L*mats(0).nnz
+ if ((istep - 1) % opts.evalStep == 0 || ! datasource.hasNext) {
+ val scores = model.evalblockg(mats)
+ reslist.append(scores.newcopy)
+ samplist.append(here)
+ } else {
+ model.doblockg(mats, here)
+ if (regularizer != null) regularizer.compute(here)
+ updater.update(here)
+ }
+ if (model.opts.putBack >= 0) datasource.putBack(mats, model.opts.putBack)
+ istep += 1
+ val dsp = datasource.progress
+ if (dsp > lastp + opts.pstep && reslist.length > lasti) {
+ val gf = gflop
+ lastp = dsp - (dsp % opts.pstep)
+ print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format (
+ 100f*lastp,
+ Learner.scoreSummary(reslist, lasti, reslist.length),
+ gf._1,
+ gf._2,
+ bytes*1e-9,
+ bytes/gf._2*1e-6))
+ if (model.useGPU) {
+ print(", GPUmem=%3.2f" format GPUmem._1)
+ }
+ println
+ lasti = reslist.length
+ }
+ }
+ updater.updateM
+ ipass += 1
+ }
+ val gf = gflop
+ println("Time=%5.4f secs, gflops=%4.2f" format (gf._2, gf._1))
+ results = Learner.scores2FMat(reslist) on row(samplist.toList)
+ }
+}
+
+case class ParLearner(
+ val datasources:Array[DataSource],
+ val models:Array[Model],
+ val regularizers:Array[Regularizer],
+ val updaters:Array[Updater],
+ val opts:Learner.Options = new Learner.Options) {
+
+ var um:FMat = null
+ var mm:FMat = null
+ var results:FMat = null
+
+ def run() = {
+ flip
+ val mm0 = models(0).modelmats(0)
+ mm = zeros(mm0.nrows, mm0.ncols)
+ um = zeros(mm0.nrows, mm0.ncols)
+
+    val done = izeros(opts.nthreads, 1) // per-thread pass counters, mutated in place by the actors
+ var ipass = 0
+ var istep0 = 0L
+ var ilast0 = 0L
+ var bytes = 0L
+ val reslist = new ListBuffer[FMat]
+ val samplist = new ListBuffer[Float]
+ var lastp = 0f
+ done.clear
+ for (ithread <- 0 until opts.nthreads) {
+ Actor.actor {
+ if (ithread < Mat.hasCUDA) setGPU(ithread)
+ var here = 0L
+ updaters(ithread).clear
+ while (ipass < opts.npasses) {
+ if (ithread == 0) println("i=%2d" format ipass)
+ datasources(ithread).reset
+ var istep = 0
+ var lasti = 0
+ while (datasources(ithread).hasNext) {
+ val mats = datasources(ithread).next
+ here += datasources(ithread).opts.blockSize
+ for (j <- 0 until mats.length) bytes += 12L * mats(j).nnz
+ istep += 1
+ istep0 += 1
+ try {
+ if (istep % opts.evalStep == 0) {
+ val scores = models(ithread).synchronized {models(ithread).evalblockg(mats)}
+ reslist.append(scores)
+ samplist.append(here)
+ } else {
+ models(ithread).synchronized {
+ models(ithread).doblockg(mats, here)
+ if (regularizers != null && regularizers(ithread) != null) regularizers(ithread).compute(here)
+ updaters(ithread).update(here)
+ }
+ }
+ } catch {
+ case e:Exception => {
+ print("Caught exception in thread %d %s\nTrying restart..." format (ithread, e.toString))
+ restart(ithread)
+ println("Keep on truckin...")
+ }
+ }
+ Thread.sleep(opts.coolit)
+ if (models(ithread).opts.putBack >= 0) datasources(ithread).putBack(mats, models(ithread).opts.putBack)
+// if (istep % (opts.syncStep/opts.nthreads) == 0) syncmodel(models, ithread)
+ if (ithread == 0 && datasources(0).progress > lastp + opts.pstep) {
+ lastp += opts.pstep
+ val gf = gflop
+ if (reslist.length > lasti) {
+ print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format (
+ 100f*lastp,
+ Learner.scoreSummary(reslist, lasti, reslist.length),
+ gf._1,
+ gf._2,
+ bytes*1e-9,
+ bytes/gf._2*1e-6))
+ if (models(0).useGPU) {
+ for (i <- 0 until math.min(opts.nthreads, Mat.hasCUDA)) {
+ setGPU(i)
+ if (i==0) print(", GPUmem=%3.2f" format GPUmem._1) else print(", %3.2f" format GPUmem._1)
+ }
+ }
+ println
+ }
+ lasti = reslist.length
+ }
+ }
+ models(ithread).synchronized {updaters(ithread).updateM}
+ done(ithread) = ipass + 1
+ while (done(ithread) > ipass) Thread.sleep(1)
+ }
+ }
+ }
+ while (ipass < opts.npasses) {
+ while (mini(done).v == ipass) {
+ while (istep0 < ilast0 + opts.syncStep) Thread.sleep(1)
+ syncmodels(models)
+ ilast0 += opts.syncStep
+ }
+ ipass += 1
+ }
+ val gf = gflop
+ println("Time=%5.4f secs, gflops=%4.2f, MB/s=%5.2f, GB=%5.2f" format (gf._2, gf._1, bytes/gf._2*1e-6, bytes*1e-9))
+ results = Learner.scores2FMat(reslist) on row(samplist.toList)
+ }
+
+ def syncmodels(models:Array[Model]) = {
+ for (j <- 0 until models(0).modelmats.length) {
+ mm.clear
+ for (i <- 0 until models.length) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ models(i).synchronized {
+ um <-- models(i).modelmats(j)
+ }
+ mm ~ mm + um
+ }
+ mm ~ mm *@ (1f/models.length)
+ for (i <- 0 until models.length) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ models(i).synchronized {
+ models(i).modelmats(j) <-- mm
+ }
+ }
+ }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ }
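+
+  // syncmodels is synchronous parameter averaging: each thread's model
+  // (possibly resident on its own GPU) is copied into um, summed into mm,
+  // scaled by 1/nmodels and broadcast back, so the replicas agree after
+  // every sync.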
+
+ def syncmodel(models:Array[Model], ithread:Int) = {
+ mm.synchronized {
+ um <-- models(ithread).modelmats(0)
+ um ~ um *@ (1f/opts.nthreads)
+ mm ~ mm *@ (1 - 1f/opts.nthreads)
+ mm ~ mm + um
+ models(ithread).modelmats(0) <-- mm
+ }
+ }
+
+ def restart(ithread:Int) = {
+ if (models(0).useGPU) {
+ resetGPU
+ Mat.trimCache2(ithread)
+ }
+ models(ithread).init(datasources(ithread))
+ models(ithread).modelmats(0) <-- mm
+ updaters(ithread).init(models(ithread))
+ }
+}
+
+case class ParLearnerx(
+ val datasource:DataSource,
+ val models:Array[Model],
+ val regularizers:Array[Regularizer],
+ val updaters:Array[Updater],
+ val opts:Learner.Options = new Learner.Options) {
+
+ var um:FMat = null
+ var mm:FMat = null
+ var results:FMat = null
+ var cmats:Array[Array[Mat]] = null
+
+ def run() = {
+ flip
+ val mm0 = models(0).modelmats(0)
+ mm = zeros(mm0.nrows, mm0.ncols)
+ um = zeros(mm0.nrows, mm0.ncols)
+ cmats = new Array[Array[Mat]](opts.nthreads)
+ for (i <- 0 until opts.nthreads) cmats(i) = new Array[Mat](datasource.omats.length)
+
+ val done = iones(opts.nthreads, 1)
+ var ipass = 0
+ var here = 0L
+ var feats = 0L
+ var lasti = 0
+ var bytes = 0L
+ val reslist = new ListBuffer[FMat]
+ val samplist = new ListBuffer[Float]
+ for (i <- 0 until opts.nthreads) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ updaters(i).clear
+ }
+ while (ipass < opts.npasses) {
+ datasource.reset
+ var istep = 0
+ var lastp = 0f
+ println("i=%2d" format ipass)
+ while (datasource.hasNext) {
+ for (ithread <- 0 until opts.nthreads) {
+ if (datasource.hasNext) {
+ done(ithread) = 0
+ val mats = datasource.next
+ here += datasource.opts.blockSize
+ feats += mats(0).nnz
+ bytes += 12L*mats(0).nnz
+ for (j <- 0 until mats.length) cmats(ithread)(j) = safeCopy(mats(j), ithread)
+ Actor.actor {
+ if (ithread < Mat.hasCUDA) setGPU(ithread)
+ try {
+ if ((istep + ithread + 1) % opts.evalStep == 0 || !datasource.hasNext ) {
+ val scores = models(ithread).evalblockg(cmats(ithread))
+ reslist.append(scores(0))
+ samplist.append(here)
+ } else {
+ models(ithread).doblockg(cmats(ithread), here)
+ if (regularizers != null && regularizers(ithread) != null) regularizers(ithread).compute(here)
+ updaters(ithread).update(here)
+ }
+ } catch {
+ case e:Exception => {
+ print("Caught exception in thread %d %s\nTrying restart..." format (ithread, e.toString))
+ restart(ithread)
+ println("Keep on truckin...")
+ }
+ }
+ done(ithread) = 1
+ }
+ }
+ }
+ while (mini(done).v == 0) Thread.sleep(1)
+ Thread.sleep(opts.coolit)
+ istep += opts.nthreads
+ if (istep % opts.syncStep == 0) syncmodels(models)
+ if (datasource.progress > lastp + opts.pstep) {
+ lastp += opts.pstep
+ val gf = gflop
+ if (reslist.length > lasti) {
+ print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format (
+ 100f*lastp,
+ Learner.scoreSummary(reslist, lasti, reslist.length),
+ gf._1,
+ gf._2,
+ bytes*1e-9,
+ bytes/gf._2*1e-6))
+ if (models(0).useGPU) {
+ for (i <- 0 until math.min(opts.nthreads, Mat.hasCUDA)) {
+ setGPU(i)
+ if (i==0) print(", GPUmem=%3.2f" format GPUmem._1) else print(", %3.2f" format GPUmem._1)
+ }
+ }
+ println
+ }
+ lasti = reslist.length
+ }
+ }
+ println
+ for (i <- 0 until opts.nthreads) {
+      if (i < Mat.hasCUDA) setGPU(i)
+ updaters(i).updateM
+ }
+ ipass += 1
+ saveAs("/big/twitter/test/results.mat", Learner.scores2FMat(reslist) on row(samplist.toList), "results")
+ }
+ val gf = gflop
+ println("Time=%5.4f secs, gflops=%4.2f, samples=%4.2g, MB/sec=%4.2g" format (gf._2, gf._1, 1.0*here, bytes/gf._2/1e6))
+ results = Learner.scores2FMat(reslist) on row(samplist.toList)
+ if (0 < Mat.hasCUDA) setGPU(0)
+ }
+
+ def safeCopy(m:Mat, ithread:Int):Mat = {
+ m match {
+ case ss:SMat => {
+ val out = SMat.newOrCheckSMat(ss.nrows, ss.ncols, ss.nnz, null, m.GUID, ithread, "safeCopy".##)
+ ss.copyTo(out)
+ }
+ }
+ }
+
+ def syncmodels(models:Array[Model]) = {
+ for (j <- 0 until models(0).modelmats.length) {
+ mm.clear
+ for (i <- 0 until models.length) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ um <-- models(i).modelmats(j)
+ mm ~ mm + um
+ }
+ mm ~ mm *@ (1f/models.length)
+ for (i <- 0 until models.length) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ models(i).modelmats(j) <-- mm
+ }
+ }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ }
+
+ def restart(ithread:Int) = {
+ if (models(0).useGPU) {
+ resetGPU
+ Mat.trimCaches(ithread)
+ }
+ models(ithread).init(datasource)
+ models(ithread).modelmats(0) <-- mm
+ updaters(ithread).init(models(ithread))
+ }
+}
+
+
+class LearnFParModel(
+ val mopts:Model.Options,
+ mkmodel:(Model.Options)=>Model,
+ val uopts:Updater.Options,
+ mkupdater:(Updater.Options)=>Updater,
+ ddfun:(Int,Int)=>DataSource
+ ) {
+ var dds:Array[DataSource] = null
+ var models:Array[Model] = null
+ var updaters:Array[Updater] = null
+ var learner:ParLearner = null
+ var lopts = new Learner.Options
+
+ def setup = {
+ dds = new Array[DataSource](lopts.nthreads)
+ models = new Array[Model](lopts.nthreads)
+ updaters = new Array[Updater](lopts.nthreads)
+ for (i <- 0 until lopts.nthreads) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ dds(i) = ddfun(lopts.nthreads, i)
+ dds(i).init
+ models(i) = mkmodel(mopts)
+ models(i).init(dds(i))
+ updaters(i) = mkupdater(uopts)
+ updaters(i).init(models(i))
+ }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ learner = new ParLearner(dds, models, null, updaters, lopts)
+ }
+
+ def init = {
+ for (i <- 0 until lopts.nthreads) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ if (dds(i).omats.length > 1) dds(i).omats(1) = ones(mopts.dim, dds(i).omats(0).ncols)
+ dds(i).init
+ models(i).init(dds(i))
+ updaters(i).init(models(i))
+ }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ }
+
+ def run = learner.run
+}
+
+
+class LearnFParModelx(
+ val ds:DataSource,
+ val mopts:Model.Options,
+ mkmodel:(Model.Options)=>Model,
+ val uopts:Updater.Options,
+ mkupdater:(Updater.Options)=>Updater) {
+ var models:Array[Model] = null
+ var updaters:Array[Updater] = null
+ var learner:ParLearnerx = null
+ var lopts = new Learner.Options
+
+ def setup = {
+ models = new Array[Model](lopts.nthreads)
+ updaters = new Array[Updater](lopts.nthreads)
+ ds.init
+ for (i <- 0 until lopts.nthreads) {
+ if (i < Mat.hasCUDA) setGPU(i)
+ models(i) = mkmodel(mopts)
+ models(i).init(ds)
+ updaters(i) = mkupdater(uopts)
+ updaters(i).init(models(i))
+ }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ learner = new ParLearnerx(ds, models, null, updaters, lopts)
+ }
+
+ def init = {
+    if (ds.omats.length > 1) ds.omats(1) = ones(mopts.dim, ds.omats(0).ncols)
+    ds.init
+    for (i <- 0 until lopts.nthreads) {
+      if (i < Mat.hasCUDA) setGPU(i)
+      models(i).init(ds)
+      updaters(i).init(models(i))
+    }
+ if (0 < Mat.hasCUDA) setGPU(0)
+ }
+ def run = learner.run
+}
+
+object Learner {
+
+ class Options {
+ var npasses = 10
+ var evalStep = 15
+ var syncStep = 32
+ var nthreads = 4
+ var pstep = 0.01f
+ var coolit = 60
+ }
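+
+  // Minimal driver sketch (hypothetical session; assumes a word-count SMat
+  // `m` is in scope):
+  //   val lrn = LDAModel.learn(m) // MatDataSource + LDAModel + IncNormUpdater
+  //   lrn.setup                   // init datasource, model and updater
+  //   lrn.run                     // scores-vs-samples end up in lrn.results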
+
+ def scoreSummary(reslist:ListBuffer[FMat], lasti:Int, length:Int):String = {
+ var i = lasti
+ var sum = 0.0
+ while (i < length) {
+ sum += reslist(i)(0)
+ i += 1
+ }
+ ("ll=%5.3f" format sum/(length-lasti))
+ }
+
+ def scores2FMat(reslist:ListBuffer[FMat]):FMat = {
+ val out = FMat(reslist(0).length, reslist.length)
+ var i = 0
+ while (i < reslist.length) {
+ out(?, i) = reslist(i)
+ i += 1
+ }
+ out
+ }
+}
+
diff --git a/src/main/scala/BIDMach/Model.scala b/src/main/scala/BIDMach/Model.scala
new file mode 100755
index 00000000..8e166fa9
--- /dev/null
+++ b/src/main/scala/BIDMach/Model.scala
@@ -0,0 +1,69 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,GMat,GIMat,GSMat,HMat,IMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Model(val opts:Model.Options = new Model.Options) {
+
+ var modelmats:Array[Mat] = null
+
+ var updatemats:Array[Mat] = null
+
+ var mats:Array[Mat] = null
+
+ var gmats:Array[Mat] = null
+
+ var useGPU = false
+
+ def init(datasource:DataSource):Unit = {
+ mats = datasource.next
+ datasource.reset
+ useGPU = opts.useGPU && Mat.hasCUDA > 0
+ if (useGPU) {
+ gmats = new Array[Mat](mats.length)
+ for (i <- 0 until mats.length) {
+ gmats(i) = mats(i) match {
+ case aa:FMat => GMat(aa)
+ case aa:SMat => GSMat(aa)
+ }
+ }
+ } else {
+ gmats = mats
+ }
+ }
+
+ def doblock(mats:Array[Mat], i:Long) // Calculate an update for the updater
+
+ def evalblock(mats:Array[Mat]):FMat // Scores (log likelihoods)
+
+ def doblockg(amats:Array[Mat], i:Long) = {
+ if (useGPU) copyMats(amats, gmats)
+ doblock(gmats, i)
+ if (useGPU && opts.putBack >= 0) amats(opts.putBack) <-- gmats(opts.putBack)
+ }
+
+ def evalblockg(amats:Array[Mat]):FMat = {
+ if (useGPU) copyMats(amats, gmats)
+ val v = evalblock(gmats)
+ if (useGPU && opts.putBack >= 0) amats(opts.putBack) <-- gmats(opts.putBack)
+ v
+ }
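+
+  // The *g variants stage the host matrices into the GPU copies built in init,
+  // run the block there, and (when opts.putBack >= 0) copy the designated
+  // matrix back so the data source can persist per-example state across passes.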
+
+ def copyMats(from:Array[Mat], to:Array[Mat]) = {
+ for (i <- 0 until from.length) {
+      to(i) <-- from(i) // in-place copy into the buffers allocated in init
+    }
+  }
+}
+
+
+object Model {
+ class Options {
+ var nzPerColumn:Int = 0
+ var startBlock = 8000
+ var useGPU = true
+ var putBack = -1
+ var doubleScore = false
+ var dim = 256
+ }
+}
diff --git a/src/main/scala/BIDMach/NMFModel.scala b/src/main/scala/BIDMach/NMFModel.scala
new file mode 100755
index 00000000..115bfe7a
--- /dev/null
+++ b/src/main/scala/BIDMach/NMFModel.scala
@@ -0,0 +1,138 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
+class NMFModel(opts:NMFModel.Options = new NMFModel.Options) extends FactorModel(opts) {
+
+ var mm:Mat = null
+ var mdiag:Mat = null
+ var udiag:Mat = null
+
+ override def init(datasource:DataSource) = {
+ super.init(datasource)
+ mm = modelmats(0)
+ modelmats = new Array[Mat](2)
+ modelmats(0) = mm
+ modelmats(1) = mm.zeros(mm.nrows, mm.ncols)
+ updatemats = new Array[Mat](2)
+ updatemats(0) = mm.zeros(mm.nrows, mm.ncols)
+ updatemats(1) = mm.zeros(mm.nrows, mm.ncols)
+ udiag = mkdiag(opts.uprior*ones(opts.dim,1))
+ mdiag = mkdiag(opts.mprior*ones(opts.dim,1))
+ if (useGPU) {
+ udiag = GMat(udiag)
+ mdiag = GMat(mdiag)
+ }
+ }
+
+ override def uupdate(sdata:Mat, user:Mat) = {
+ val modeldata = mm * sdata
+ val mmu = mm *^ mm + udiag
+ for (i <- 0 until opts.uiter) {
+ val quot = modeldata / (mmu * user)
+ min(10.0f, max(0.1f, quot, quot), quot)
+ user ~ user *@ quot
+ max(opts.minuser, user, user)
+ }
+ }
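+
+  // uupdate is a ridge-regularized Lee-Seung multiplicative step:
+  //   user <- user ∘ (W d) ./ ((W W' + uprior*I) user)
+  // with the multiplier clamped to [0.1, 10] each round for stability, and the
+  // result floored at opts.minuser.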
+
+ override def mupdate(sdata:Mat, user:Mat):Unit = {
+ val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+ updatemats(0) ~ (user *^ sdata) *@ mm
+ updatemats(1) ~ uu * mm
+ max(updatemats(1), opts.NMFeps, updatemats(1))
+ }
+
+ override def mupdate2(sdata:Mat, user:Mat):Unit = {
+ val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+ updatemats(0) ~ user *^ sdata
+ updatemats(1) ~ uu * mm
+ }
+
+ override def evalfun(sdata:Mat, user:Mat):FMat = {
+ if (opts.doubleScore) {
+ evalfunx(sdata, user)
+ } else {
+ val modeldata = mm * sdata
+ val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+ val mmm = mm *^ mm
+
+ val ll0 = sdata.contents ddot sdata.contents
+ val ll1 = modeldata ddot user
+ val ll2 = uu ddot mmm
+ val v1 = (-ll0 + 2*ll1 - ll2)/sdata.nnz
+ val v2 = -opts.uprior*(user ddot user)/sdata.nnz
+ row(v1,v2)
+ }
+ }
+
+ def evalfunx(sdata0:Mat, user0:Mat):FMat = {
+ val sdata = SDMat(sdata0)
+ val user = DMat(user0)
+ val mmf = DMat(mm)
+ val mdiagf = DMat(mdiag)
+
+ val modeldata = mmf * sdata
+ val uu = user *^ user + mdiagf *@ (1.0f*size(user,2)/opts.nusers)
+ val mmm = mmf *^ mmf
+
+ val ll0 = sdata.contents ddot sdata.contents
+ val ll1 = modeldata ddot user
+ val ll2 = uu ddot mmm
+ val v1 = (-ll0 + 2*ll1 - ll2)/sdata.nnz
+ val v2 = -opts.uprior*(user ddot user)/sdata.nnz
+ row(v1,v2)
+ }
+}
+
+object NMFModel {
+ class Options extends FactorModel.Options {
+ var NMFeps = 1e-12
+ var uprior = 0.01f
+ var mprior = 1e-4f
+ var nusers = 100000
+ }
+
+ def mkNMFmodel(fopts:Model.Options) = {
+ new NMFModel(fopts.asInstanceOf[NMFModel.Options])
+ }
+
+ def mkUpdater(nopts:Updater.Options) = {
+ new IncNormUpdater(nopts.asInstanceOf[IncNormUpdater.Options])
+ }
+
+ def learn(mat0:Mat) = {
+ new Learner(new MatDataSource(Array(mat0:Mat)), new NMFModel(), null, new IncNormUpdater(), new Learner.Options)
+ }
+
+ def learnBatch(mat0:Mat) = {
+ new Learner(new MatDataSource(Array(mat0:Mat)), new NMFModel(), null, new BatchNormUpdater(), new Learner.Options)
+ }
+
+ def learnFPar(
+ nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+ nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+ ) = {
+ new LearnFParModel(
+ new NMFModel.Options, mkNMFmodel _,
+ new IncNormUpdater.Options, mkUpdater _,
+ (n:Int, i:Int) => SFilesDataSource.twitterWords(nstart, nend, n, i)
+ )
+ }
+
+ def learnFParx(
+ nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+ nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+ ) = {
+ new LearnFParModelx(
+ SFilesDataSource.twitterWords(nstart, nend),
+ new NMFModel.Options, mkNMFmodel _,
+ new IncNormUpdater.Options, mkUpdater _)
+ }
+}
+
+
+
diff --git a/src/main/scala/BIDMach/Regression.scala b/src/main/scala/BIDMach/Regression.scala
new file mode 100755
index 00000000..b74dab19
--- /dev/null
+++ b/src/main/scala/BIDMach/Regression.scala
@@ -0,0 +1,55 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class RegressionModel(override val opts:RegressionModel.Options) extends Model {
+ var targmap:Mat = null
+ var targets:Mat = null
+ var mask:Mat = null
+
+ override def init(datasource:DataSource) = {
+ super.init(datasource)
+ useGPU = opts.useGPU && Mat.hasCUDA > 0
+ val data0 = mats(0)
+ val m = size(data0, 1)
+ val d = opts.targmap.nrows
+ val sdat = (sum(data0,2).t + 0.5f).asInstanceOf[FMat]
+ val sp = sdat / sum(sdat)
+ println("initial perplexity=%f" format (sp ddot ln(sp)) )
+
+ val rmat = rand(d,m)
+ rmat ~ rmat *@ sdat
+ val msum = sum(rmat, 2)
+ rmat ~ rmat / msum
+ val mm = rmat
+    modelmats = new Array[Mat](1)
+ modelmats(0) = if (useGPU) GMat(mm) else mm
+ updatemats = new Array[Mat](1)
+ updatemats(0) = modelmats(0).zeros(mm.nrows, mm.ncols)
+ targets = if (useGPU) GMat(opts.targets) else opts.targets
+ targmap = if (useGPU) GMat(opts.targmap) else opts.targmap
+ mask = if (useGPU) GMat(opts.mask) else opts.mask
+ }
+
+ def mupdate(data:Mat):FMat
+
+ def doblock(gmats:Array[Mat], i:Long) = {
+ val sdata = gmats(0)
+ mupdate(sdata)
+ }
+
+ def evalblock(mats:Array[Mat]):FMat = {
+ val sdata = gmats(0)
+ mupdate(sdata)
+ }
+}
+
+object RegressionModel {
+ class Options extends Model.Options {
+ var targets:FMat = null
+ var targmap:FMat = null
+ var mask:FMat = null
+ }
+}
diff --git a/src/main/scala/BIDMach/Regularizer.scala b/src/main/scala/BIDMach/Regularizer.scala
new file mode 100755
index 00000000..105b24d3
--- /dev/null
+++ b/src/main/scala/BIDMach/Regularizer.scala
@@ -0,0 +1,38 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Regularizer(val opts:Regularizer.Options = new Regularizer.Options) {
+ val options = opts
+ var modelmats:Array[Mat] = null
+ var updatemats:Array[Mat] = null
+
+ def compute(step:Float)
+
+ def init(model:Model) = {
+ modelmats = model.modelmats
+ updatemats = model.updatemats
+ }
+}
+
+class L1Regularizer(override val opts:Regularizer.Options = new Regularizer.Options) extends Regularizer(opts) {
+ def compute(step:Float) = {
+ for (i <- 0 until modelmats.length) {
+ updatemats(i) ~ updatemats(i) + (sign(modelmats(i)) * (-step*options.mprior))
+ }
+ }
+}
+
+class L2Regularizer(override val opts:Regularizer.Options = new Regularizer.Options) extends Regularizer(opts) {
+ def compute(step:Float) = {
+ for (i <- 0 until modelmats.length) {
+ updatemats(i) ~ updatemats(i) + (modelmats(i) * (-options.mprior * step))
+ }
+ }
+}
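+
+// Both regularizers work in gradient space, adding their penalty gradient to
+// updatemats before the updater applies it: L1 contributes
+// -step*mprior*sign(w), the subgradient of mprior*|w|_1, and L2 contributes
+// -step*mprior*w. Since mprior is an FMat it can also be set per coordinate.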
+
+object Regularizer {
+ class Options {
+    var mprior:FMat = 1e-7f
+  }
+}
diff --git a/src/main/scala/BIDMach/Sampler.scala b/src/main/scala/BIDMach/Sampler.scala
new file mode 100755
index 00000000..7cb9d3b2
--- /dev/null
+++ b/src/main/scala/BIDMach/Sampler.scala
@@ -0,0 +1,22 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,GMat,GIMat,GSMat,HMat,IMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Sampler {
+
+ val options:Sampler.Options
+
+ def insample(pos:Int, modelnum:Int):Int = 1
+
+ def outsample(mat:Mat):Unit = {}
+
+
+}
+
+
+object Sampler {
+ class Options {
+
+ }
+}
diff --git a/src/main/scala/BIDMach/TestLearner.scala b/src/main/scala/BIDMach/TestLearner.scala
new file mode 100755
index 00000000..e081906b
--- /dev/null
+++ b/src/main/scala/BIDMach/TestLearner.scala
@@ -0,0 +1,153 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Plotting._
+
+
+object TestLearner {
+/*
+ def runLDALearner(rt:SMat, rtest:SMat, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+
+// Mat.numThreads = 1
+ val model = new LDAmodel()
+ model.options.dim = ndims
+ model.options.uiter = 4
+ model.options.uprior = 1e-1f
+ model.options.mprior = 1e0f
+ model.options.minuser = 1e-7f
+ model.options.nzPerColumn = 400
+ model.options.useGPU = useGPU
+
+ val updater = new MultUpdater
+ updater.options.alpha = 0.3f
+// val updater = new MultUpdater(model)
+// updater.options.alpha = 0.1f
+ updater.options.initnsteps = 8000f
+
+ val learner = Learner(rt, null, rtest, null, model, null, updater)
+ learner.options.npasses = 20
+ learner.options.secprint = 100
+ learner.options.blocksize = 8000 //size(rt,2)
+ learner.options.numGPUthreads = nthreads
+ learner.run
+ learner
+ }
+
+ def runNMFLearner(rt:SMat, rtest:SMat, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+ val model = new NMFmodel()
+ model.options.dim = ndims
+ model.options.uiter = 4
+ model.options.uprior = 1e-4f
+ model.options.mprior = 1e2f
+ model.options.minuser = 1e-8f
+ model.options.nzPerColumn = 400
+ model.options.useGPU = useGPU
+
+ val updater = new MultUpdater
+ updater.options.alpha = 0.1f
+// val updater = new MultUpdater(model)
+// updater.options.alpha = 0.1f
+ updater.options.initnsteps = 8000f
+
+ val learner = Learner(rt, null, rtest, null, model, null, updater)
+ learner.options.npasses = 10
+ learner.options.secprint = 100
+ learner.options.blocksize = 16000/nthreads //size(rt,2)//40000 //
+ learner.options.numGPUthreads = nthreads
+ learner.run
+ learner
+ }
+
+ def runLogLearner(rt:SMat, st:FMat, rtest:SMat, stest:FMat):Learner = {
+ val model = new LogisticModel()
+ model.options.useGPU = false
+
+ val regularizer = new L1Regularizer(model)
+ regularizer.options.mprior = 1e-7f
+
+ val updater = new ADAGradUpdater
+ updater.options.alpha = 300f
+ updater.options.gradwindow = 1e6f
+
+ val learner = Learner(rt, st > 4, rtest, stest > 4, model, regularizer, updater)
+ learner.options.npasses = 20
+ learner.run
+ learner
+ }
+
+ def runLinLearner(rt:SMat, st:FMat, rtest:SMat, stest:FMat):Learner = {
+ val model = new LinearRegModel() {
+ override def regfn(targ:Mat, pred:Mat, lls:Mat, gradw:Mat):Unit = linearMap1(targ, pred, lls, gradw)
+ }
+ model.options.nzPerColumn = 400
+ model.options.transpose = false
+ model.options.useGPU = false
+
+ val regularizer = new L1Regularizer(model)
+ regularizer.options.mprior = 1e-6f
+
+ val updater = new ADAGradUpdater { override def update(step:Int):Unit = update1(step) }
+ // regularizer.options.beta = 1e-7f
+ updater.options.alpha = 200f
+ updater.options.gradwindow = 1e6f
+
+ val learner = Learner(rt, st, rtest, stest, model, regularizer, updater)
+ learner.options.npasses = 10
+ learner.options.secprint = 100
+ learner.run
+ learner
+ }
+
+ def runtest(dirname:String, ntest:Int, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+ tic
+ val revtrain:SMat = load(dirname+"xpart1.mat", "revtrain")
+ val revtest:SMat = load(dirname+"xpart1.mat", "revtest")
+ val t1 = toc; tic
+ val rt = revtrain(0->4000,0->(8000*(size(revtrain,2)/8000)))
+ val rtest = revtest(0->4000,0->(8000*(size(revtest,2)/8000)))
+ val scrtrain:IMat = load(dirname+"xpart1.mat", "scrtrain")
+ val scrtest:IMat = load(dirname+"xpart1.mat", "scrtest")
+ val st = FMat(scrtrain).t
+ val stest = (FMat(scrtest).t)(?,0->(8000*(size(revtest,2)/8000)))
+ val t2 = toc
+ println("Reading time=%3.2f+%3.2f seconds" format (t1,t2))
+ val ntargs = ndims
+ val stt = zeros(ntargs, size(st,2))
+ val sttest = zeros(ntargs, size(stest,2))
+ for (i<-0 until size(stt,1)) {stt(i,?) = st; sttest(i,?) = stest}
+ flip
+ val learner:Learner = ntest match {
+ case 1 => runLinLearner(rt, stt, rtest, sttest)
+ case 2 => runLogLearner(rt, stt, rtest, sttest)
+ case 3 => runNMFLearner(rt , rtest, ndims, nthreads, useGPU)
+ case 4 => runLDALearner(rt , rtest, ndims, nthreads, useGPU)
+ }
+ val (ff, tt) = gflop
+ println("Time=%5.3f, gflops=%3.2f" format (tt, ff))
+ val xvals = irow(1->(learner.tscores.size+1))
+ val tscores = learner.tscores
+ val tscorex = learner.tscorex
+ val tsteps = learner.tsteps
+ val timeplot = semilogy(xvals, drow(tscores), xvals, drow(tscorex))
+ val stepplot = semilogy(drow(tsteps), drow(learner.tscores), drow(tsteps), drow(tscorex))
+// val userhist = hist(log10(FMat(targetmat)(?)),100)
+ timeplot.setTitle("Neg. log likelihood vs time in seconds")
+ stepplot.setTitle("Neg. log likelihood vs number of samples")
+ val modelhist = hist(log10(FMat(learner.model.modelmat)(?)),100)
+// val userhist = hist(log10(FMat(learner.targetmat)(?)),100)
+ learner
+ }
+
+
+ def main(args: Array[String]): Unit = {
+ val dirname = args(0)
+ val ntest = args(1).toInt
+ val ndims = args(2).toInt
+ val nthreads = args(3).toInt
+ val useGPU = args(4).toBoolean
+
+ Mat.checkCUDA
+ runtest(dirname, ntest, ndims, nthreads, useGPU)
+ } */
+}
diff --git a/src/main/scala/BIDMach/Updater.scala b/src/main/scala/BIDMach/Updater.scala
new file mode 100755
index 00000000..1e4632ae
--- /dev/null
+++ b/src/main/scala/BIDMach/Updater.scala
@@ -0,0 +1,391 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
+abstract class Updater(val opts:Updater.Options = new Updater.Options) {
+ var model:Model = null
+
+ def init(model0:Model) = {
+ model = model0
+ }
+
+ def update(step:Long):Unit
+ def updateM():Unit = {}
+ def clear():Unit = {}
+}
+
+
+class IncNormUpdater(override val opts:IncNormUpdater.Options = new IncNormUpdater.Options) extends Updater(opts) {
+
+ var firstStep = 0f
+ var rm:Mat = null
+ var restart:Mat = null
+ var started:Int = 0
+
+ override def init(model0:Model) = {
+ super.init(model0)
+ val modelmats = model0.modelmats
+ val updatemats = model0.updatemats
+ restart = modelmats(0) + 1f
+ rm = model0.modelmats(0).zeros(1,1)
+ firstStep = 0f
+ }
+
+ def update(step:Long) = {
+ val modelmats = model.modelmats
+ val updatemats = model.updatemats
+ val mm = modelmats(0)
+ val um = updatemats(0)
+ val rr = if (step == 0) 0.99f else {
+ if (firstStep == 0f) {
+ firstStep = step
+ 0.99f
+ } else {
+ math.pow(firstStep / step, opts.power).toFloat
+ }
+ }
+ if (modelmats.length > 1) {
+ val ms = modelmats(1)
+ val ums = updatemats(1)
+// println("ums0 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+ ums ~ ums *@ rm.set(rr)
+// println("ums1 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+ ms ~ ms *@ rm.set(1-rr)
+// println("ums2 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+ ms ~ ms + ums
+// println("ums3 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+ um ~ um / ms
+// println("um %g %g" format (mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+ }
+ um ~ um *@ rm.set(rr)
+ mm ~ mm *@ rm.set(1-rr)
+ mm ~ mm + um
+ mm ~ mm / sum(mm,2)
+ if (opts.warmup > 0) {
+ if (started == 0 && step > opts.warmup) {
+ restart <-- mm
+ started = 1
+ }
+ if (started == 1 && step > 2*opts.warmup) {
+ mm ~ mm - restart
+ max(mm, 0f, mm)
+ mm ~ mm / sum(mm,2)
+ started = 2
+ }
+ }
+ }
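+
+  // The blend weight decays as rr = (firstStep/step)^power, a power-law moving
+  // average of the row-normalized updates. The warmup logic snapshots the
+  // model once `warmup` steps have passed and later subtracts that snapshot,
+  // washing the random initialization out of the running average.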
+
+ override def clear() = {
+ firstStep = 0f
+ }
+}
+
+class BatchNormUpdater(override val opts:BatchNormUpdater.Options = new BatchNormUpdater.Options) extends Updater {
+ var accumulators:Array[Mat] = null
+
+ override def init(model0:Model) = {
+ super.init(model0)
+ val modelmats = model.modelmats
+ val updatemats = model.updatemats
+ accumulators = new Array[Mat](updatemats.length)
+ for (i <- 0 until accumulators.length) {
+ accumulators(i) = updatemats(i).zeros(updatemats(i).nrows, updatemats(i).ncols)
+ }
+ }
+
+ def update(step:Long) = {
+ val updatemats = model.updatemats
+ for (i <- 0 until accumulators.length) {
+ accumulators(i) ~ accumulators(i) + updatemats(i)
+ }
+ }
+
+ override def clear() = {
+ for (i <- 0 until accumulators.length) {
+ accumulators(i).clear
+ }
+ }
+
+ override def updateM():Unit = {
+ val mm = model.modelmats(0)
+ mm ~ accumulators(0) / accumulators(1)
+ mm ~ mm / sum(mm,2)
+ clear
+ }
+}
+
+
+class IncMultUpdater(override val opts:IncMultUpdater.Options = new IncMultUpdater.Options) extends Updater {
+
+ var firstStep = 0f
+ var rm:Mat = null
+
+ override def init(model0:Model) = {
+ super.init(model0)
+ rm = model0.modelmats(0).zeros(1,1)
+ }
+
+ def update(step:Long) = {
+ val modelmats = model.modelmats
+ val updatemats = model.updatemats
+ val mm = modelmats(0)
+ val ms = modelmats(1)
+ val um = updatemats(0)
+ val ums = updatemats(1)
+ val rr = if (step == 0) 1f else {
+ if (firstStep == 0f) {
+ firstStep = step
+ 1f
+ } else {
+ math.pow(firstStep / step, opts.power).toFloat
+ }
+ }
+// println("rr=%g, %g %g" format (rr, mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+ um ~ um *@ rm.set(rr)
+// println("rr=%g, %g %g" format (rr, mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+ ln(mm, mm)
+// println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+ mm ~ mm *@ rm.set(1-rr)
+// println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+ mm ~ mm + um
+// println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+ exp(mm, mm)
+// println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+ mm ~ mm / sum(mm,2)
+ }
+
+ override def clear() = {
+ firstStep = 0f
+ }
+}
+
+class TelescopingUpdater(override val opts:TelescopingUpdater.Options = new TelescopingUpdater.Options) extends Updater {
+ var accumulators:Array[Mat] = null
+ var firstStep = 0L
+ var nextStep = 10L
+ var nextCount = 0L
+ var rm:Mat = null
+
+ override def init(model0:Model) = {
+ super.init(model0)
+ val modelmats = model0.modelmats
+ val updatemats = model0.updatemats
+ rm = model0.modelmats(0).zeros(1,1)
+ accumulators = new Array[Mat](updatemats.length)
+ for (i <- 0 until updatemats.length) {
+ accumulators(i) = updatemats(i).zeros(updatemats(i).nrows, updatemats(i).ncols)
+ }
+ firstStep = 0L
+ nextStep = 10L
+ nextCount = 0L
+ }
+
+ def update(step:Long) = {
+ if (firstStep == 0 && step > 0) {
+ firstStep = step
+ }
+ val updatemats = model.updatemats
+ for (i <- 0 until updatemats.length) {
+ accumulators(i) ~ accumulators(i) + updatemats(i)
+ }
+ if (step >= nextCount) {
+ model.modelmats(0) ~ accumulators(0) / accumulators(1)
+ nextStep = (nextStep * opts.factor).toLong
+ nextCount = step + nextStep
+ }
+ }
+
+ override def clear() = {
+ for (i <- 0 until accumulators.length) {
+ accumulators(i).clear
+ }
+ }
+}
+
+
+class GradUpdater(override val opts:GradUpdater.Options = new GradUpdater.Options) extends Updater {
+
+ var firstStep = 0f
+ var modelmat:Mat = null
+ var updatemat:Mat = null
+ var sumSq:Mat = null
+ var stepn:Mat = null
+ var mask:Mat = null
+ var ve:Mat = null
+ var te:Mat = null
+ var alpha:Mat = null
+
+ override def init(model0:Model) = {
+ model = model0
+ modelmat = model.modelmats(0)
+ updatemat = model.updatemats(0)
+ mask = opts.mask
+ stepn = modelmat.zeros(1,1)
+ te = modelmat.zeros(opts.timeExponent.nrows, opts.timeExponent.ncols)
+ alpha = modelmat.zeros(opts.alpha.nrows, opts.alpha.ncols)
+ te <-- opts.timeExponent
+ alpha <-- opts.alpha
+ }
+
+ def update(step:Long):Unit = {
+ val nsteps = if (step == 0) 1f else {
+ if (firstStep == 0f) {
+ firstStep = step
+ 1f
+ } else {
+ step / firstStep
+ }
+ }
+ stepn.set(1f/nsteps)
+ if (opts.waitsteps < nsteps) {
+ val tmp = updatemat *@ (alpha *@ (stepn ^ te))
+ modelmat ~ modelmat + tmp
+ if (mask != null) modelmat ~ modelmat *@ mask
+ }
+ }
+}
+
+
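+/**
+ * ADAGrad-style updater: keeps a running average of squared gradients in
+ * sumSq and scales each coordinate's step by its inverse root,
+ *   modelmat <- modelmat + alpha *@ updatemat / (sumSq^ve *@ nsteps^te + epsilon).
+ * With vecExponent = 0.5 this recovers the usual per-coordinate ADAGrad
+ * normalization.
+ */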
+class ADAGradUpdater(override val opts:ADAGradUpdater.Options = new ADAGradUpdater.Options) extends Updater {
+
+ var firstStep = 0f
+ var modelmat:Mat = null
+ var updatemat:Mat = null
+ var sumSq:Mat = null
+ var stepn:Mat = null
+ var mask:Mat = null
+ var ve:Mat = null
+ var te:Mat = null
+ var alpha:Mat = null
+
+ override def init(model0:Model) = {
+ model = model0
+ modelmat = model.modelmats(0)
+ updatemat = model.updatemats(0)
+ mask = opts.mask
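+    // asInstanceOf[AnyRef] forces a plain reference comparison, so the null
+    // test can't be hijacked by implicit Mat conversions.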
+ if (sumSq.asInstanceOf[AnyRef] == null) {
+ sumSq = modelmat.ones(size(modelmat,1), size(modelmat,2)) *@ opts.initsumsq
+ } else {
+ sumSq.set(opts.initsumsq)
+ }
+ stepn = modelmat.zeros(1,1)
+ ve = modelmat.zeros(opts.vecExponent.nrows, opts.vecExponent.ncols)
+ te = modelmat.zeros(opts.timeExponent.nrows, opts.timeExponent.ncols)
+ alpha = modelmat.zeros(opts.alpha.nrows, opts.alpha.ncols)
+ ve <-- opts.vecExponent
+ te <-- opts.timeExponent
+ alpha <-- opts.alpha
+ }
+
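+  // Alternate ADAGrad step that builds the scaled gradient with fresh
+  // temporaries; update() below does the same computation reusing tmp
+  // in place.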
+ def update2(step:Long):Unit = {
+ val nsteps = if (step == 0) 1f else {
+ if (firstStep == 0f) {
+ firstStep = step
+ 1f
+ } else {
+ step / firstStep
+ }
+ }
+ stepn.set(nsteps)
+ val nw = 1f / stepn
+ val newsquares = updatemat *@ updatemat
+ newsquares ~ newsquares *@ nw
+ sumSq ~ sumSq *@ (1f - nw)
+ sumSq ~ sumSq + newsquares
+ if (opts.waitsteps < nsteps) {
+ val tmp = sumSq ^ ve
+ tmp ~ tmp *@ (stepn ^ te)
+ tmp ~ tmp + opts.epsilon
+ modelmat ~ modelmat + ((updatemat / tmp) *@ alpha)
+ if (mask != null) modelmat ~ modelmat *@ mask
+ }
+ }
+
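+  // Same ADAGrad step as update2, but the division and scaling reuse tmp
+  // in place to cut down on temporaries.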
+ def update(step:Long):Unit = {
+ val nsteps = if (step == 0) 1f else {
+ if (firstStep == 0f) {
+ firstStep = step
+ 1f
+ } else {
+ step / firstStep
+ }
+ }
+ stepn.set(nsteps)
+ val nw = 1f / stepn
+ val newsquares = updatemat *@ updatemat
+ newsquares ~ newsquares *@ nw
+ sumSq ~ sumSq *@ (1f - nw)
+ sumSq ~ sumSq + newsquares
+ if (opts.waitsteps < nsteps) {
+ val tmp = sumSq ^ ve
+ tmp ~ tmp *@ (stepn ^ te)
+ tmp ~ tmp + opts.epsilon
+ tmp ~ updatemat / tmp
+ tmp ~ tmp *@ alpha
+ modelmat ~ modelmat + tmp
+ if (mask != null) modelmat ~ modelmat *@ mask
+ }
+ }
+}
+
+
+
+object IncNormUpdater {
+ class Options extends Updater.Options {
+ var warmup = 0L
+ var power = 0.9f
+ }
+}
+
+object IncMultUpdater {
+ class Options extends Updater.Options {
+ var warmup = 0L
+ var power = 0.9f
+ }
+}
+
+object BatchNormUpdater {
+ class Options extends Updater.Options {
+
+ }
+}
+
+object BatchMultUpdater {
+ class Options extends Updater.Options {
+ var eps = 1e-12
+
+ }
+}
+
+object TelescopingUpdater {
+ class Options extends Updater.Options {
+    var factor = 1.5f
+ }
+}
+
+object GradUpdater {
+ class Options extends Updater.Options {
+ var alpha:FMat = 1f
+ var timeExponent:FMat = 0.5f
+ var waitsteps = 2
+ var mask:FMat = null
+ }
+}
+
+
+object ADAGradUpdater {
+ class Options extends GradUpdater.Options {
+ var vecExponent:FMat = 0.5f
+ var epsilon = 1e-15f
+ var initsumsq = 1e-8f
+ }
+}
+
+object Updater {
+ class Options {
+
+ }
+}
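+
+// Minimal usage sketch (hypothetical driver loop -- BIDMach's Learner wires
+// these hooks up for you; names below that aren't defined in this file are
+// illustrative only):
+//
+//   val updater = new ADAGradUpdater            // or any Updater above
+//   updater.init(model)                         // attach model, allocate state
+//   var step = 0L
+//   for (minibatch <- minibatches) {            // hypothetical data source
+//     model.doblock(minibatch, step)            // model fills its updatemats
+//     updater.update(step)                      // fold gradients into modelmats
+//     step += minibatch.ncols
+//   }
+//   updater.updateM()                           // finalize (batch-style updaters)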