diff --git a/.classpath b/.classpath new file mode 100755 index 00000000..cf17507c --- /dev/null +++ b/.classpath @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100755 index 00000000..4df1a6f3 --- /dev/null +++ b/.project @@ -0,0 +1,18 @@ + + + BIDMach + + + + + + org.scala-ide.sdt.core.scalabuilder + + + + + + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + diff --git a/BIDMach.jar b/BIDMach.jar new file mode 100755 index 00000000..0cf5e46f Binary files /dev/null and b/BIDMach.jar differ diff --git a/Copyright.txt b/Copyright.txt new file mode 100755 index 00000000..21326596 --- /dev/null +++ b/Copyright.txt @@ -0,0 +1,25 @@ +Copyright (c) 2012, Regents of the University of California +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/bidmach b/bidmach new file mode 100755 index 00000000..d0948549 --- /dev/null +++ b/bidmach @@ -0,0 +1,37 @@ +#!/bin/bash +export JAVA_OPTS="-Xmx12G -Xms128M" # Set as much memory as possible +BIDMACH_ROOT="${BASH_SOURCE[0]}" +if [ ! `uname` = "Darwin" ]; then + BIDMACH_ROOT=`readlink -f "${BIDMACH_ROOT}"` +else + BIDMACH_ROOT=`readlink "${BIDMACH_ROOT}"` +fi +BIDMACH_ROOT=`dirname "$BIDMACH_ROOT"` +BIDMACH_ROOT="$( echo ${BIDMACH_ROOT} | sed s+/cygdrive/c+c:+ )" +BIDMAT_ROOT="${BIDMACH_ROOT}/../BIDMat" # Change if needed +# export JAVA_HOME="" # Set here if not set in environment +# Fix these if needed +JCUDA_VERSION="0.5.0" +JCUDA_LIBDIR="${BIDMAT_ROOT}/lib" +BIDLIB="${BIDMAT_ROOT}/lib" +LIBDIR=${BIDMACH_ROOT}/lib +if [ `uname` = "Darwin" ]; then + export DYLD_LIBRARY_PATH="${BIDMAT_ROOT}/lib:/usr/local/cuda/lib:${LD_LIBRARY_PATH}" +else + export LD_LIBRARY_PATH="${BIDMAT_ROOT}/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +fi + +BIDMAT_LIBS="${BIDMAT_ROOT}/BIDMat.jar;${BIDLIB}/ptplot.jar;${BIDLIB}/ptplotapplication.jar;${BIDLIB}/jhdf5.jar;${BIDLIB}/commons-math3-3.1.1.jar;${BIDLIB}/lz4-1.1.2.jar" +JCUDA_LIBS="${JCUDA_LIBDIR}/jcuda-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcublas-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcufft-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcurand-${JCUDA_VERSION}.jar;${JCUDA_LIBDIR}/jcusparse-${JCUDA_VERSION}.jar" + +export ALL_LIBS="${BIDMACH_ROOT}/BIDMach.jar;${BIDMAT_LIBS};${JCUDA_LIBS};${JAVA_HOME}/lib/tools.jar" + +if [ ! 
"$OS" = "Windows_NT" ]; then + export ALL_LIBS=`echo "${ALL_LIBS}" | sed 's/;/:/g'` +else + NEWPATH=`${BIDMAT_ROOT}/shortpath.bat "${CUDA_BIN_PATH}"` + NEWPATH=`echo $NEWPATH | sed 's_\\\\_/_g'` + JAVA_OPTS="-Djava.library.path=${BIDMAT_ROOT}/lib;${NEWPATH} "$JAVA_OPTS +fi + +scala -nobootcp -cp "${ALL_LIBS}" -Yrepl-sync -i ${LIBDIR}/bidmach_init.scala \ No newline at end of file diff --git a/bidmach.cmd b/bidmach.cmd new file mode 100755 index 00000000..d904faf3 --- /dev/null +++ b/bidmach.cmd @@ -0,0 +1,20 @@ +@ECHO OFF +:: Set JAVA_HOME here if not set in environment +:: SET JAVA_HOME= +:: Set as much memory as possible +(SET JAVA_OPTS=-Xmx12G -Xms128M) +:: Fix these if needed +SET JCUDA_VERSION=0.5.0 +SET BIDLIB="%CD%\..\BIDMat\lib" +SET LIBDIR="%CD%\lib" +SET JCUDA_LIBDIR=%BIDLIB% +SET PATH=%BIDLIB%;%PATH% + +SET BIDMACH_LIBS=%BIDLIB%\..\BIDMat.jar;%CD%\BIDMach.jar;%BIDLIB%\ptplot.jar;%BIDLIB%\ptplotapplication.jar;%BIDLIB%\jhdf5.jar;%BIDLIB%\commons-math3-3.1.1.jar;%BIDLIB%\lz4-1.1.2.jar + +SET JCUDA_LIBS=%JCUDA_LIBDIR%\jcuda-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcublas-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcufft-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcurand-%JCUDA_VERSION%.jar;%JCUDA_LIBDIR%\jcusparse-%JCUDA_VERSION%.jar + +SET ALL_LIBS=%BIDMACH_LIBS%;%JCUDA_LIBS%;%JAVA_HOME%\lib\tools.jar +:: echo %ALL_LIBS% + +scala -nobootcp -cp "%ALL_LIBS%" -Yrepl-sync -i %LIBDIR%\bidmach_init.scala \ No newline at end of file diff --git a/build.sbt b/build.sbt new file mode 100755 index 00000000..9aecc3b1 --- /dev/null +++ b/build.sbt @@ -0,0 +1,40 @@ + +name := "BIDMach" + +version := "0.1.0" + +organization := "edu.berkeley.bid" + +scalaVersion := "2.9.2" + +resolvers ++= Seq( + "Scala Tools Snapshots" at "http://scala-tools.org/repo-snapshots/", + "Scala Mirror" at "https://oss.sonatype.org/content/repositories/releases/" +) + +libraryDependencies <<= (scalaVersion, libraryDependencies) { (sv, deps) => + deps :+ ("org.scala-lang" % "scala-compiler" % sv) +} + 
+libraryDependencies += "org.scala-lang" % "jline" % "2.9.2" + +libraryDependencies += "org.scalatest" %% "scalatest" % "1.8" % "test" + +libraryDependencies += "org.scalacheck" %% "scalacheck" % "1.9" % "test" + +libraryDependencies += "junit" % "junit" % "4.5" % "test" + +credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") + +javacOptions ++= Seq("-source", "1.5", "-target", "1.5") + +scalacOptions ++= Seq("-deprecation","-target:jvm-1.5") + +initialCommands := scala.io.Source.fromFile("lib/bidmach_init.scala").getLines.mkString("\n") + +javaOptions += "-Xmx12g" + +//seq(ProguardPlugin.proguardSettings :_*) + + + diff --git a/lib/bidmach_init.scala b/lib/bidmach_init.scala new file mode 100755 index 00000000..d11e2758 --- /dev/null +++ b/lib/bidmach_init.scala @@ -0,0 +1,10 @@ +import BIDMat.{BMat,CMat,CSMat,DMat,Dict,IDict,FMat,GMat,GIMat,GSMat,HMat,IMat,Mat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ +import BIDMat.Solvers._ +import BIDMat.Plotting._ +import BIDMach.{MatDataSource,FilesDataSource,SFilesDataSource,Learner,LDAModel,NMFModel} + +Mat.checkMKL +Mat.checkCUDA + diff --git a/project/plugins.sbt b/project/plugins.sbt new file mode 100755 index 00000000..bf5cb709 --- /dev/null +++ b/project/plugins.sbt @@ -0,0 +1,7 @@ + +libraryDependencies <+= sbtVersion(v => "com.github.siasia" %% "xsbt-proguard-plugin" % (v+"-0.1.1")) + +resolvers += "Proguard plugin repo" at "http://siasia.github.com/maven2" + + + diff --git a/src/main/scala/BIDMach/Clustering.scala b/src/main/scala/BIDMach/Clustering.scala new file mode 100755 index 00000000..4e7e6385 --- /dev/null +++ b/src/main/scala/BIDMach/Clustering.scala @@ -0,0 +1,372 @@ +package BIDMach +import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ + + +class PAMmodel(opts:PAMmodel.Options = new PAMmodel.Options) { + + + var a:FMat = null + var nfeats = 0 + var nsamps = 0 + var ncenters = 
0 + var ntrys = 0 + val options = opts + var maxdepth = 0 + var nspills = 0 + var bestc:IMat = null + var imin:IMat = null + var vdists:DMat = null + var sil:DMat = null + var mss = 0.0 + + var ncache = 0 + + + def dists(x:FMat) = options.metric.dists(x:FMat) + def dists(x:FMat, y:FMat) = options.metric.dists(x:FMat, y:FMat) + + def init(a0:FMat) = { + + a = a0 + nfeats = size(a0,2) + nsamps = size(a0,1) + ncenters = options.ncenters + ntrys = options.ntrys + vdists = dzeros(nsamps,1) + imin = izeros(nsamps,1) + sil = dzeros(nsamps,1) + + ncache = min(nsamps,options.cbal*nsamps/ncenters)(0,0) + + } + + + // Silhouette Score + def silhouetteScore(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat, lab:IMat):DMat = { + + var aa = zeros(nsamps,1) + var bb = zeros(nsamps,1) + + var silhouette = zeros(nsamps,1) + + var dimclu = zeros(length(icenters),1) // Size of each cluster + for( i <- 0 to nsamps-1 ) { dimclu(lab(i))+=1 } + + //val ncache = size(iss,1) + val centmap = accum(icenters, icol(1 to length(icenters)), size(ds,2), 1) + var i = 0 + while (i < length(isamps)) { + + val ii = isamps(i) + val labi = lab(i) + + var maxsize = dimclu(labi) + 1 + + var first = true + var j = 0 + var count = 0 + + while (j < ncache && count < maxsize) { + + + if(lab(j)==labi){ + aa(i)+=ds(j, ii) + }else { // pick the second closest center + + if(first){ + + val labj = lab(j) + maxsize += dimclu(labj) - 1 + first = false + + + } + bb(i)+=ds(j, ii) + } + + j += 1 + } + silhouette(i) = (bb(i) - aa(i)) / max(aa(i),bb(i)) + i += 1 + } + + silhouette + + + } + + + def mindists(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat, vmin:DMat, imin:IMat) = { + + val centmap = accum(icenters, icol(1 to length(icenters)), size(ds,2), 1) + var i = 0 + var ispills = izeros(1,nsamps) + + var spills = 0 + + while (i < length(isamps)) { + val ii = isamps(i) + var continue = true + var j = 0 + while (j < ncache && continue) { + + if (centmap(iss(j, ii)) > 0) { + imin(ii) = centmap(iss(j, ii)) - 1 + vmin(ii) = 
ds(j, ii) + continue = false + } + j += 1 + } + maxdepth = math.max(maxdepth, j) + + + if (j >= ncache & continue) + { + + ispills(spills) = i + spills += 1 + nspills += 1 + + imin(ii) = -1 + vmin(ii) = -1 + + } + Mat.nflops += 4*j + i += 1 + } + + if(spills>0) + { + + ispills = ispills(0,0 until spills) + + + val dspill = dists(a(icenters(?,0),?),a(ispills,?)) + + var (ddd,iii) = maxi2(dspill,1) + + imin(ispills) = centmap(iii) + for (i <- 0 until spills){ vmin(ispills(i),0) = ddd(0,i)} + + } + + } + + def mindists(ds:FMat, iss:IMat, isamps:IMat, icenters:IMat):(DMat, IMat) = { + val vmin = dzeros(nsamps,1) + val imin = izeros(nsamps,1) + mindists(ds, iss, isamps, icenters, vmin, imin) + (vmin, imin) + } + + + def pointdiffs(ds:FMat, iss:IMat, vd:DMat):DMat = { + val deltas = dzeros(nsamps,1) // Array to hold improvements in distance over vd + + var ispills = izeros(1,nsamps) + var spills = 0 + + var i = 0 + while (i < nsamps) { // Calculate improvements over vd for new candidate centers + var j = 0 + while (j < ncache && ds(j,i) < vd(i)) { // using sorted order of ds + deltas(iss(j,i)) += ds(j,i) - vd(i) + j += 1 + } + maxdepth = math.max(maxdepth, j) + if (j >= ncache){ + if(ds(j-1,i) < vd(i)) + { + ispills(spills) = i + nspills += 1 + spills += 1 + } + } + + Mat.nflops += 4 * j + i += 1 + } + + if(spills > 0) + { + + ispills = ispills(?,0 until spills) + val threshold = ds(ncache-1,ispills) + + val dspill = dists(a)(?,ispills) + + for (i <- 0 until spills){ // Calculate improvements over vd for new candidate centers + + var j = 0 + val ii = ispills(0,i) + + while (j < nsamps) { + if(dspill(j,i) < vd(ii) & dspill(j,i) > threshold(i)) { + deltas(j) += dspill(j,i) - vd(ii) + } + j += 1 + } + } + + } + + deltas + } + + + def sortgen(dd:FMat):(FMat,IMat) = { // Sorts the COLUMNS in ascending order... + + + if (Mat.hasCUDA <= 0) { // until GPUsort fixed + var (smat, imat) = sort2(dd,1) + + + if(ncache < nsamps){ + smat = smat(0 until ncache,?) 
+ imat = imat(0 until ncache,?) + } + + (smat, imat) + + } else { + + var smat = dd.copy + var imat = icol(0->nsamps)*iones(1,nsamps) + + GMat.sortGPU(smat, imat) + + if(ncache < nsamps){ + smat = smat(0 until ncache,?) + imat = imat(0 until ncache,?) + } + + (smat, imat) + + } + + } + + def run = { + println("PAM clustering %d points with %d features into %d centers" format (nsamps, nfeats, ncenters)) + flip + val dd = dists(a) + val ft1 = gflop + println("Distances in %f seconds, %f gflops" format (ft1._2,ft1._1)) + flip + val (ds, iss) = sortgen(dd) + + + + + Mat.nflops += math.round(math.log(size(ds,1))/math.log(2.0))*size(ds,1)*size(ds,2) + val ft2 = gflop + println("Sort in %f seconds, %f gcomps" format (ft2._2,ft2._1)) + var bestv:DMat = null + var besti:IMat = null + var bestvd = Double.MaxValue + flip + var itry = 0 + + while(itry < ntrys) { + + println("Try %d" format itry) + val rr = rand(nsamps,1) // Get a random permutation for the centers + val (rs,irs) = sort2(rr,1) + val icenters = irs(0->ncenters,0) // Pick centers from the permutation + val ics = icol(0->nsamps) + mindists(ds, iss, ics, icenters, vdists, imin) // Get min distances from points to centers, and best center ids + println(" pass=0, mean dist=%f" format mean(vdists,1).v) + val vtmp = vdists.copy + val itmp = imin.copy + var nchanged = 1 + var ipass = 0 + var totchanged = 0 + while (nchanged > 0 && ipass < options.maxpasses) { // Keep making passes until no improvements + ipass += 1 + nchanged = 0 + var ipc = 0 + while (ipc < ncenters) { // Try to improve this center (ipc) + vtmp <-- vdists // Copy distances + val ifix = find(imin == ipc) // Find points in cluster with this center + val tcents = icenters((0->ipc) \ ((ipc+1)->ncenters),0) // List of centers minus the current one + mindists(ds, iss, ifix, tcents, vtmp, itmp) // vtmp holds distances to centers minus the current center + val deltas = pointdiffs(ds, iss, vtmp) // deltas holds improvements for each potential center over vtmp + 
val (vs,is) = mini2(deltas) // Find best new center + if (vs.v + sum(vtmp).v < sum(vdists).v && is.v != icenters(ipc,0)) { // Is the new center better than the old (and not equal to it)? + icenters(ipc) = is.v // If yes, update the center list + mindists(ds, iss, ics, icenters, vdists, imin) // Compute new distances and centers + nchanged += 1 + if (options.verb) println(" pass=%d, ipc=%d, mean dist=%f, nchanged=%d" format (ipass, ipc, mean(vdists,1).v, nchanged)) + } + ipc += 1 + } + println(" pass=%d, mean dist=%f, nchanged=%d, nspills=%d" format (ipass, mean(vdists,1).v, nchanged, nspills)) + totchanged += nchanged + } + val mv = mean(vdists).v + if (mv < bestvd) { + bestc = icenters + bestv = vdists + besti = imin + bestvd = mv + } + itry += 1 + } + val t3=gflop + val vdists2 = mini(dd(?,bestc),2) + println("Optimum in %f secs, %f gflops, mean dist=%f, verify=%f\n maxdepth=%d, nspills=%d, ncache=%d\nTotal time %f seconds" format + (t3._2, t3._1, bestvd, mean(DMat(vdists2),1).v, maxdepth, nspills, ncache, t3._2+ft2._2+ft1._2)) + + val ics = icol(0->nsamps) + flip + + sil= silhouetteScore(ds, iss, ics, bestc,imin) + val t4=gflop + mss = mean(sil,2)(0,0) + + println("Mean Silhouette Score (MSS) %f \n Elapsed time %f secs" format(mss, t4._1 )) + + } + +} + +object PAMmodel { + + class Options { + var ncenters = 1000 + var maxpasses = 10 + var ntrys = 1 + var metric:Distance = null + var verb = false + var cbal = 10 + + } + + def runit(nsamps:Int, nfeats:Int, ncenters:Int,metric:String) = { + println("Generating dataset") + val c = rand(ncenters, nfeats) + val a = rand(nsamps, nfeats)*0.3f + for (i <- 0 until nsamps by ncenters) {val il = math.min(i+ncenters, nsamps); a(i->il,?) 
+= c(0->(il-i),?)} + val cc = new PAMmodel + cc.options.ncenters = ncenters + cc.options.metric = metric match { + case "euclid" => new euclidDistance + case "cosangle" => new cosangleDistance + case "corr" => new correlationDistance + } + cc.init(a) + cc.run + } + + def main(args:Array[String]) = { + Mat.checkCUDA + val nsamps= args(0).toInt + val nfeats = args(1).toInt + val ncenters = args(2).toInt + val metric = args(3) + runit(nsamps, nfeats, ncenters, metric) + } +} + diff --git a/src/main/scala/BIDMach/Copyright.txt b/src/main/scala/BIDMach/Copyright.txt new file mode 100755 index 00000000..21326596 --- /dev/null +++ b/src/main/scala/BIDMach/Copyright.txt @@ -0,0 +1,25 @@ +Copyright (c) 2012, Regents of the University of California +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/src/main/scala/BIDMach/DataSource.scala b/src/main/scala/BIDMach/DataSource.scala new file mode 100755 index 00000000..7561b20f --- /dev/null +++ b/src/main/scala/BIDMach/DataSource.scala @@ -0,0 +1,738 @@ +package BIDMach +import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ +import scala.actors._ +import java.io._ + +abstract class DataSource(val opts:DataSource.Options = new DataSource.Options) { + def next:Array[Mat] + def hasNext:Boolean + def reset:Unit + def putBack(mats:Array[Mat],i:Int):Unit = {throw new RuntimeException("putBack not implemented")} + def setupPutBack(n:Int,dim:Int):Unit = {throw new RuntimeException("putBack not implemented")} + def nmats:Int + def init:Unit + def progress:Float + var omats:Array[Mat] = null +} + +class MatDataSource(var mats:Array[Mat], override val opts:MatDataSource.Options = new MatDataSource.Options) extends DataSource(opts) { + var sizeMargin = 0f + var here = 0 + var there = 0 + var blockSize = 0 + var totalSize = 0 + var umat:Mat = null + + omats = null + + def init = { + sizeMargin = opts.sizeMargin + blockSize = opts.blockSize + if (opts.addConstFeat) { + mats(0) = mats(0) on sparse(ones(1, mats(0).ncols)) + } + if (opts.featType == 0) { + mats(0).contents.set(1) + } + here = -blockSize + totalSize = mats(0).ncols + omats = new Array[Mat](mats.length) + for (i <- 0 until mats.length) { + omats(i) = mats(i) match { + case mm:SMat => 
SMat(mats(i).nrows, blockSize, (mats(i).nnz * sizeMargin * blockSize / mats(i).ncols).toInt) + case mm:SDMat => SDMat(mats(i).nrows, blockSize, (mats(i).nnz * sizeMargin * blockSize / mats(i).ncols).toInt) + case _ => mats(i).zeros(mats(i).nrows, blockSize) + } + } + } + + def nmats = omats.length + + def reset = { + here = -blockSize + } + + def next:Array[Mat] = { + here = math.min(here+blockSize, mats(0).ncols) + there = math.min(here+blockSize, mats(0).ncols) + for (i <- 0 until mats.length) { + omats(i) = mats(i).colslice(here, there, omats(i)) + } + omats + } + + def hasNext:Boolean = { + here + blockSize < mats(0).ncols + } + + override def setupPutBack(n:Int, dim:Int) = { + if (mats.length < n) { + val newmats = new Array[Mat](n) + for (i <- 0 until n-1) { + newmats(i) = mats(i) + } + newmats(n-1) = ones(dim, mats(0).ncols) + mats = newmats + } + } + + override def putBack(tmats:Array[Mat],i:Int):Unit = { + tmats(i).colslice(0, tmats(i).ncols, mats(i), here) + } + + def progress = { + math.min((here+blockSize)*1f/totalSize, 1f) + } + +} + +class FilesDataSource(override val opts:FilesDataSource.Options = new FilesDataSource.Options) extends DataSource(opts) { + var sizeMargin = 0f + var blockSize = 0 + @volatile var fileno = 0 + var rowno = 0 + var nstart = 0 + var fnames:List[(Int)=>String] = null + omats = null + var matqueue:Array[Array[Mat]] = null + var ready:IMat = null + var stop:Boolean = false + var permfn:(Int)=>Int = null + var totalSize = 0 + + def softperm(nstart:Int, nend:Int) = { + val dd1 = nstart / 24 + val hh1 = nstart % 24 + val dd2 = nend / 24 + val hh2 = nend % 24 + val (dmy, ii) = sort2(rand(dd2-dd1+1+opts.lookahead)) + (n:Int) => { + val dd = n / 24 + val hh = n % 24 + val ddx = ii(dd-dd1)+dd1 + val ddx0 = ddx % 31 + val ddx1 = ddx / 31 + val hhdd = hh + 24 * (ddx0 - 1) + (ddx1 * 31 + (hhdd % 31 + 1)) * 24 + hhdd / 31 + } + } + + def initbase = { + nstart = opts.nstart + fnames = opts.fnames + blockSize = opts.blockSize + while 
(!fileExists(fnames(0)(nstart))) {nstart += 1} + if (opts.order == 1) { + val (dmy, rr) = sort2(rand(opts.nend+opts.lookahead+1-nstart,1)) // Randomize the file read order + permfn = (a:Int) => rr(a-nstart)+nstart + } else { + permfn = (n:Int) => { // Stripe reads across disks (different days) + val (yy, mm, dd, hh) = FilesDataSource.decodeDate(n) + val hhdd = hh + 24 * (dd - 1) + FilesDataSource.encodeDate(yy, mm, hhdd % 31 + 1, hhdd / 31) + } + } + fileno = nstart // Number of the current output file + rowno = 0 // row number in the current output file + totalSize = opts.nend - nstart + matqueue = new Array[Array[Mat]](opts.lookahead) // Queue of matrices for each output matrix + ready = -iones(opts.lookahead, 1) // Numbers of files currently loaded in queue + for (i <- 0 until opts.lookahead) { + matqueue(i) = new Array[Mat](fnames.size) + } + for (i <- 0 until opts.lookahead) { + Actor.actor { + prefetch(nstart + i) + } + } + } + + def reset = { + fileno = nstart + rowno = 0 + for (i <- 0 until opts.lookahead) { + val ifile = nstart + i + val ifilex = ifile % opts.lookahead + ready(ifilex) = ifile - opts.lookahead + } + } + + def init = { + initbase + omats = new Array[Mat](fnames.size) + for (i <- 0 until fnames.size) { + var mm = HMat.loadMat(fnames(i)(nstart)) + if (opts.dorows) { + omats(i) = mm.zeros(blockSize, mm.ncols) + } else { + omats(i) = mm.zeros(mm.nrows, blockSize) + } + } + } + + def progress = { + (fileno-nstart)*1f / totalSize + } + + def nmats = omats.length + + def next:Array[Mat] = { + var donextfile = false + var todo = blockSize + while (todo > 0 && fileno < opts.nend) { + var nrow = rowno + val filex = fileno % opts.lookahead + while (ready(filex) < fileno) Thread.sleep(1) + for (i <- 0 until fnames.size) { + val matq = matqueue(filex)(i) + if (matq != null) { + val matqnr = if (opts.dorows) matq.nrows else matq.ncols + nrow = math.min(rowno + todo, matqnr) + if (opts.dorows) { + omats(i) = matq.rowslice(rowno, nrow, omats(i), blockSize - 
todo) + } else { + omats(i) = matq.colslice(rowno, nrow, omats(i), blockSize - todo) + } + if (matqnr == nrow) donextfile = true + } else { + donextfile = true + } + } + todo -= nrow - rowno + if (donextfile) { + fileno += 1 + rowno = 0 + donextfile = false + } else { + rowno = nrow + } + } + omats + } + + def fileExists(fname:String) = { + val testme = new File(fname) + testme.exists + } + + def lazyTranspose(a:Mat) = { + a match { + case af:FMat => FMat(a.ncols, a.nrows, af.data) + case ad:DMat => DMat(a.ncols, a.nrows, ad.data) + case ai:IMat => IMat(a.ncols, a.nrows, ai.data) + case _ => throw new RuntimeException("laztTranspose cant deal with "+a.getClass.getName) + } + } + + def prefetch(ifile:Int) = { + val ifilex = ifile % opts.lookahead + ready(ifilex) = ifile - opts.lookahead + while (!stop) { + while (ready(ifilex) >= fileno) Thread.sleep(1) + val inew = ready(ifilex) + opts.lookahead + val pnew = permfn(inew) + val fexists = fileExists(fnames(0)(pnew)) && (rand(1,1).v < opts.sampleFiles) + for (i <- 0 until fnames.size) { + matqueue(ifilex)(i) = if (fexists) { + HMat.loadMat(fnames(i)(pnew), matqueue(ifilex)(i)) + } else null +// println("%d" format inew) + } + ready(ifilex) = inew + } + } + + def hasNext:Boolean = { + (fileno < opts.nend) + } + +} + +class SFilesDataSource(override val opts:SFilesDataSource.Options = new SFilesDataSource.Options) extends FilesDataSource(opts) { + + var inptrs:IMat = null + var offsets:IMat = null + + override def init = { + initbase + var totsize = sum(opts.fcounts).v + if (opts.addConstFeat) totsize += 1 + omats = new Array[Mat](1) + omats(0) = SMat(totsize, opts.blockSize, opts.blockSize * opts.eltsPerSample) + inptrs = izeros(opts.fcounts.length, 1) + offsets = 0 on cumsum(opts.fcounts) + } + + def binFind(i:Int, mat:Mat):Int = { + val imat = mat.asInstanceOf[IMat] + val nrows = mat.nrows + var ibeg = 0 + var iend = nrows + while (ibeg < iend) { + val imid = (iend + ibeg)/2 + if (i > imat(imid, 0)) { + ibeg = imid+1 
+ } else { + iend = imid + } + } + iend + } + + def sprowslice(inmat:Array[Mat], rowno:Int, nrow:Int, omat0:Mat, done:Int):Mat = { + val omat = omat0.asInstanceOf[SMat] + val ioff = Mat.ioneBased + var idone = done + var innz = omat.nnz + val lims = opts.fcounts + val nfiles = opts.fcounts.length + val addConstFeat = opts.addConstFeat + val featType = opts.featType + var j = 0 + while (j < nfiles) { + inptrs(j, 0) = binFind(rowno, inmat(j)) + j += 1 + } + var irow = rowno + while (irow < nrow) { + var j = 0 + while (j < nfiles) { + val mat = inmat(j).asInstanceOf[IMat] + val mrows = mat.nrows + var k = inptrs(j) + while (k < mrows && mat.data(k) < irow) k += 1 + inptrs(j) = k + val xoff = innz - k + val yoff = offsets(j) + ioff + while (k < mat.nrows && mat.data(k) == irow && mat.data(k+mrows) < lims(j)) { + omat.ir(xoff + k) = mat.data(k+mrows) + yoff + omat.data(xoff + k) = if (featType == 0) 1f else mat.data(k+2*mrows) + k += 1 + } + innz = xoff + k + inptrs(j) = k + j += 1 + } + irow += 1 + idone += 1 + if (addConstFeat) { + omat.ir(innz) = omat.nrows - 1 + ioff + omat.data(innz) = 1 + innz += 1 + } + omat.jc(idone) = innz + ioff + } + omat.nnz0 = innz + omat + } + + def spmax(matq:Array[Mat]):Int = { + var maxv = 0 + for (i <- 0 until matq.length) { + if (matq(i) != null) { + val mat = matq(i).asInstanceOf[IMat] + maxv = math.max(maxv, mat(mat.nrows-1,0)) + } + } + maxv + } + + def fillup(mat:Mat, todo:Int) = { + val smat = mat.asInstanceOf[SMat] + val ncols = mat.ncols + var i = ncols - todo + val theend = smat.jc(i) + while (i < ncols) { + i += 1 + smat.jc(i) = theend + } + } + + def flushMat(mat:Mat) = { + val smat = mat.asInstanceOf[SMat] + smat.nnz0 = 0 + smat.jc(0) = Mat.ioneBased + } + + override def next:Array[Mat] = { + var donextfile = false + var todo = blockSize + flushMat(omats(0)) + while (todo > 0 && fileno < opts.nend) { + var nrow = rowno + val filex = fileno % opts.lookahead + while (ready(filex) < fileno) Thread.sleep(1) + val spm = 
spmax(matqueue(filex)) + nrow = math.min(rowno + todo, spm) + val matq = matqueue(filex) + if (matq(0) != null) { + omats(0) = sprowslice(matq, rowno, nrow, omats(0), blockSize - todo) + if (spm == nrow) donextfile = true + } else { + donextfile = true + } + todo -= nrow - rowno + if (donextfile) { + fileno += 1 + rowno = 0 + donextfile = false + } else { + rowno = nrow + } + } + if (todo > 0) { + fillup(omats(0), todo) + } + omats + } + +} + +class BlendedDataSource(val s1:DataSource, val s2:DataSource, var alpha:Float, var samp1:Float, var samp2:Float, + override val opts:BlendedDataSource.Options = new BlendedDataSource.Options) extends DataSource(opts) { + var sizeMargin = 0f + var here = 0L + var there = 0 + var iptr1 = 0 + var iptr2 = 0 + var blockSize = 0 + var bBlock = 0 + var totalSize = 0 + var randv:FMat = null + var rands1:FMat = null + var rands2:FMat = null + var mats1:Array[Mat] = null + var mats2:Array[Mat] = null + omats = null + + def init = { + sizeMargin = opts.sizeMargin + blockSize = opts.blockSize + bBlock = opts.bBlock + randv = rand(1, blockSize/bBlock + 1) + rands1 = rand(1, blockSize/bBlock + 1) + rands2 = rand(1, blockSize/bBlock + 1) + here = -blockSize + s1.opts.addConstFeat = opts.addConstFeat + s2.opts.addConstFeat = opts.addConstFeat + s1.opts.featType = opts.featType + s2.opts.featType = opts.featType + s1.init + s2.init + mats1 = s1.next + mats2 = s2.next + totalSize = mats1(0).ncols + omats = new Array[Mat](mats1.length) + for (i <- 0 until mats1.length) { + omats(i) = mats1(i) match { + case mm:SMat => SMat(mats1(i).nrows, blockSize, (mats1(i).nnz * sizeMargin).toInt) + case mm:SDMat => SDMat(mats1(i).nrows, blockSize, (mats1(i).nnz * sizeMargin).toInt) + case _ => mats1(i).zeros(mats1(i).nrows, blockSize) + } + } + } + + def nmats = omats.length + + def reset = { + s1.reset + s2.reset + here = -blockSize + } + + @inline def copycol(inmats:Array[Mat], iptr:Int, jptr:Int, omats:Array[Mat], here:Int) = { + var imat = 0 + while 
(imat < inmats.length) { + omats(imat) = inmats(imat).colslice(iptr, jptr, omats(imat), here) + imat += 1 + } + } + + def next:Array[Mat] = { + rand(0, 1f, randv) + var i = 0 + var xptr = 0 + while (xptr < blockSize && hascol(mats1, iptr1, s1) && hascol(mats2, iptr2, s2)) { + if (randv.data(i) < alpha) { + while (iptr1 < mats1(0).ncols && rands1.data(iptr1/bBlock) > samp1) iptr1 += bBlock + if (iptr1 >= mats1(0).ncols) { + mats1 = s1.next + iptr1 = 0 + rand(0, 1f, samp1) + } + val jptr1 = math.min(mats1(0).ncols, iptr1 + math.min(bBlock, math.min(blockSize, omats(0).ncols) - xptr)) + copycol(mats1, iptr1, jptr1, omats, xptr) + xptr += jptr1 - iptr1 + iptr1 = jptr1 + } else { + while (iptr2 < mats2(0).ncols && rands2.data(iptr2/bBlock) > samp2) iptr2 += bBlock + if (iptr2 >= mats2(0).ncols) { + mats2 = s2.next + iptr2 = 0 + rand(0, 1f, samp2) + } + val jptr2 = math.min(mats1(0).ncols, iptr2 + math.min(bBlock, math.min(blockSize, omats(0).ncols) - xptr)) + copycol(mats1, iptr2, jptr2, omats, xptr) + xptr += jptr2 - iptr2 + iptr2 = jptr2 + } + i += 1 + } + here += xptr + if (xptr == blockSize) { + omats + } else { + shrinkmats(omats, i) + } + } + + def hascol(mats:Array[Mat], iptr:Int, ss:DataSource):Boolean = { + (iptr < mats(0).ncols) || ss.hasNext + } + + def hasNext:Boolean = { + hascol(mats1, iptr1, s1) && hascol(mats2, iptr2, s2) + } + + def shrinkmats(xmats:Array[Mat], n:Int) = { + val outarr = new Array[Mat](omats.length) + var imat = 0 + while (imat < omats.length) { + outarr(imat) = xmats(imat).colslice(0, n, null) + imat += 1 + } + outarr + } + + def progress = { + math.max(s1.progress, s2.progress) + } +} + + +object DataSource { + class Options { + var blockSize = 100000 + var sizeMargin = 3f + var sample = 1f + var addConstFeat:Boolean = false + var featType:Int = 1 // 0 = binary features, 1 = linear features + } +} + +object MatDataSource { + class Options extends DataSource.Options { + + } +} + + +object FilesDataSource { + + def encodeDate(yy:Int, 
mm:Int, dd:Int, hh:Int) = (((12*yy + mm) * 31) + dd)*24 + hh + + def decodeDate(n:Int):(Int, Int, Int, Int) = { + val days = n / 24 + val dd = (days - 1) % 31 + 1 + val months = (days - dd) / 31 + val mm = (months - 1) % 12 + 1 + val yy = (months - mm) / 12 + (yy, mm, dd, n % 24) + } + + def sampleFun(fname:String):(Int)=>String = { + (n:Int) => { + val (yy, mm, dd, hh) = decodeDate(n) + (fname format ((n / 24) % 16, yy, mm, dd, hh)) + } + } + + def sampleFun(fname:String, m:Int, i:Int):(Int)=>String = { + (n0:Int) => { + val n = n0 * m + i + val (yy, mm, dd, hh) = decodeDate(n) + (fname format ((n / 24) % 16, yy, mm, dd, hh)) + } + } + + + class Options extends DataSource.Options { + val localDir:String = "" + def fnames:List[(Int)=>String] = null + var lookahead = 8 + var sampleFiles = 1.0f + var nstart:Int = 0 + var nend:Int = 0 + var dorows:Boolean = true + var order:Int = 1 // 0 = sequential order, 1 = random + } +} + +object BlendedDataSource { + class Options extends DataSource.Options { + var bBlock = 1000 + } +} + +object SFilesDataSource { + class Options extends FilesDataSource.Options { + var fcounts:IMat = null + var eltsPerSample = 0 + } + + val twitterFeatureDir = "/disk%02d/twitter/featurized/%04d/%02d/%02d/" + val twitterSmileyFeatureDir = "/disk%02d/twitter/smiley/featurized/%04d/%02d/%02d/" + + def twitterWords( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2012,12,1,0), + n:Int = 1, + i:Int = 0, + nfeats:Int = 100000) = { + val opts = new SFilesDataSource.Options { + override def fnames:List[(Int)=>String] = List(FilesDataSource.sampleFun(twitterFeatureDir + "unifeats%02d.lz4", n, i)) + fcounts = icol(nfeats) + nstart = nstart0/n + nend = nend0/n + order = 1 + blockSize = 100000 + eltsPerSample = 40 + lookahead = 3 + } + new SFilesDataSource(opts) + } + + def twitterSmileyWords( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2013,7,1,0), + n:Int 
= 1, + i:Int = 0, + nfeats:Int = 100000) = { + val opts = new SFilesDataSource.Options { + override def fnames:List[(Int)=>String] = List(FilesDataSource.sampleFun(twitterSmileyFeatureDir + "unifeats%02d.lz4", n, i)) + fcounts = icol(nfeats) + nstart = nstart0/n + nend = nend0/n + order = 1 + blockSize = 100000 + eltsPerSample = 40 + lookahead = 3 + } + new SFilesDataSource(opts) + } + + def twitterNgrams( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2012,12,1,0), + n:Int = 1, + i:Int = 0, + nuni0:Int = 50, + nbi0:Int = 100, + ntri0:Int = 200) = { + val opts = new SFilesDataSource.Options { + override def fnames:List[(Int)=>String] = List( + FilesDataSource.sampleFun(twitterFeatureDir + "unifeats%02d.lz4", n, i), + FilesDataSource.sampleFun(twitterFeatureDir + "bifeats%02d.lz4", n, i), + FilesDataSource.sampleFun(twitterFeatureDir + "trifeats%02d.lz4", n, i) + ) + fcounts = icol(nuni0*1000,nbi0*1000,ntri0*1000) + nstart = nstart0/n + nend = nend0/n + order = 1 + blockSize = 100000 + eltsPerSample = 40 + lookahead = 3 + } + new SFilesDataSource(opts) + } + + def twitterSmileyNgrams( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2013,7,1,0), + n:Int = 1, + i:Int = 0, + nuni0:Int = 50, + nbi0:Int = 100, + ntri0:Int = 200) = { + val opts = new SFilesDataSource.Options { + override def fnames:List[(Int)=>String] = List( + FilesDataSource.sampleFun(twitterSmileyFeatureDir + "unifeats%02d.lz4", n, i), + FilesDataSource.sampleFun(twitterSmileyFeatureDir + "bifeats%02d.lz4", n, i), + FilesDataSource.sampleFun(twitterSmileyFeatureDir + "trifeats%02d.lz4", n, i) + ) + fcounts = icol(nuni0*1000,nbi0*1000,ntri0*1000) + nstart = nstart0/n + nend = nend0/n + order = 1 + blockSize = 100000 + eltsPerSample = 40 + lookahead = 3 + } + new SFilesDataSource(opts) + } + + def twitterWordBlend( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = 
FilesDataSource.encodeDate(2013,7,1,0), + n:Int = 1, + i:Int = 0, + nfeats:Int = 10000) = { + val ds1 = twitterWords(nstart0, nend0, n, i, nfeats) + val ds2 = twitterSmileyWords(nstart0, nend0, n, i, nfeats) + if (n > 1) { + ds1.opts.lookahead = 2 + ds2.opts.lookahead = 2 + } + val opts3 = new BlendedDataSource.Options + new BlendedDataSource(ds1, ds2, 0.5f, 1f, 1f, opts3) + } + + def twitterNgramBlend( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2013,7,1,0), + n:Int = 1, + i:Int = 0, + nuni0:Int = 50, + nbi0:Int = 100, + ntri0:Int = 200) = { + val ds1 = twitterNgrams(nstart0, nend0, n, i, nuni0, nbi0, ntri0) + val ds2 = twitterSmileyNgrams(nstart0, nend0, n, i, nuni0, nbi0, ntri0) + if (n > 1) { + ds1.opts.lookahead = 2 + ds2.opts.lookahead = 2 + } + val opts3 = new BlendedDataSource.Options + new BlendedDataSource(ds1, ds2, 0.7f, 1f, 1f, opts3) + } + + def testSources(nthreads:Int=4,ff:(Int,Int,Int,Int,Int)=>DataSource = twitterWords, nfeats:Int=100000):IMat = { + val nstart0 = FilesDataSource.encodeDate(2012,3,22,0) + val nend0 = FilesDataSource.encodeDate(2013,7,1,0) + var bytes = 0L + var done = 0L + var step = 10000000000L + var stop = izeros(1,1) + tic + for (i <- 0 until nthreads) { + scala.actors.Actor.actor { + val ss = ff(nstart0, nend0, nthreads, i, nfeats) + ss.init + while (ss.hasNext && stop.v != 1) { + val a = ss.next + bytes += 12L*a(0).nnz + if (bytes > done + step) { + done = (bytes/step)*step + val t=toc + println("GB=%4.2f, t=%4.2f, MB/s=%4.2f" format (bytes/1e9, t, bytes/t/1e6)) + } + } + val t = toc + println("Thread %d done, GB=%4.2f, t=%4.2f, MB/s=%4.2f" format (i, bytes/1e9, t, bytes/t/1e6)) + } + } + stop + } +} + diff --git a/src/main/scala/BIDMach/Distance.scala b/src/main/scala/BIDMach/Distance.scala new file mode 100644 index 00000000..ec94dda8 --- /dev/null +++ b/src/main/scala/BIDMach/Distance.scala @@ -0,0 +1,125 @@ +package BIDMach +import 
BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ + +abstract class Distance(val opts:Distance.Options = new Distance.Options) { + + def dists(a:FMat):FMat + def dists(a:FMat, b:FMat):FMat + +} + +object Distance { + class Options { + + } +} + + +//Euclidean Distance +class euclidDistance(override val opts:euclidDistance.Options = new euclidDistance.Options) extends Distance(opts) { + + override def dists(a:FMat):FMat = { + val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a; + val d1 = getdiag(dd) + dd ~ dd * 2.0f + dd ~ d1 - dd + dd ~ dd + (d1.t) + max(dd, 0f, dd) + sqrt(dd, dd) + dd + } + + override def dists(a:FMat, b:FMat):FMat = { + val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a); + val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b); + val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b; + ab ~ ab * 2.0f + ab ~ aa - ab + (bb.t) + max(ab, 0f, ab) + sqrt(ab, ab) + + ab + } +} + +object euclidDistance { + class Options extends Distance.Options { + + } +} + + +//Cosangle Distance +class cosangleDistance(override val opts:cosangleDistance.Options = new cosangleDistance.Options) extends Distance(opts) { + + override def dists(a:FMat):FMat = { + val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a; + var d1 = getdiag(dd) + sqrt(d1, d1) + d1 = if (Mat.hasCUDA > 0) 0.0f+(d1 xTG d1) else 0.0f+(d1 xT d1); + dd ~ 1 - dd / d1 + dd + } + + override def dists(a:FMat, b:FMat):FMat = { + val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a); + val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b); + val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b; + + sqrt(aa, aa) + sqrt(bb, bb) + + val dd = if (Mat.hasCUDA > 0) 0.0f+(aa xTG bb) else 0.0f+(aa xT bb); + ab ~ 1 - ab / dd + ab + } +} + +object cosangleDistance { + class Options extends Distance.Options { + + } +} + + +//Correlation Distance +class correlationDistance(override val opts:correlationDistance.Options = new 
correlationDistance.Options) extends Distance(opts) { + + override def dists(a:FMat):FMat = { + val mu = mean(a,1) + a ~ a - mu + val dd = if (Mat.hasCUDA > 0) a xTG a else a xT a; + var d1 = getdiag(dd) + sqrt(d1, d1) + d1 = if (Mat.hasCUDA > 0) 0+(d1 xTG d1) else 0+(d1 xT d1); + dd ~ 1 - dd / d1 + dd + } + + override def dists(a:FMat, b:FMat):FMat = { + val mua = mean(a,1) + a ~ a - mua + val mub = mean(b,1) + b ~ b - mub + + val aa = getdiag(if (Mat.hasCUDA > 0) a xTG a else a xT a); + val bb = getdiag(if (Mat.hasCUDA > 0) b xTG b else b xT b); + val ab = if (Mat.hasCUDA > 0) a xTG b else a xT b; + + sqrt(aa, aa) + sqrt(bb, bb) + + val dd = if (Mat.hasCUDA > 0) 0.0f+(aa xTG bb) else 0.0f+(aa xT bb); + ab ~ 1 - ab / dd + ab + } +} + +object correlationDistance { + class Options extends Distance.Options { + + } +} + diff --git a/src/main/scala/BIDMach/Experiments.scala b/src/main/scala/BIDMach/Experiments.scala new file mode 100755 index 00000000..82e30a34 --- /dev/null +++ b/src/main/scala/BIDMach/Experiments.scala @@ -0,0 +1,264 @@ +package BIDMach +import BIDMat.{Mat,BMat,CMat,CSMat,Dict,DMat,FMat,IDict,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ +import scala.actors._ +import java.io._ + + +object Twitter { + + def dodicts(threshold:Int=10, rebuild:Boolean=false):Unit = { + val stokdir = "/twitter/smiley/tokenized/" + val tokdir = "/twitter/tokenized/" + val dy1 = mergedicts(2011, 2013, "/disk%02d" + stokdir, "/big" + stokdir, threshold, rebuild) + val dy2 = mergedicts(2011, 2013, "/disk%02d" + tokdir, "/big" + tokdir, threshold, rebuild) + val dy = Dict.union(dy1, dy2) + val (sv, iv) = sortdown2(dy.counts) + HMat.saveBMat("/big"+tokdir+"alldict.gz", BMat(dy.cstr(iv))) + HMat.saveDMat("/big"+tokdir+"allwcount.gz", sv) + } + + def mergedicts(year1:Int, year2:Int, infname:String, outfname:String, threshold:Int=10, rebuild:Boolean=false):Dict = { + val dd = new Array[Dict](6) + val md = new Array[Dict](6) + 
val yd = new Array[Dict](5) + var dy:Dict = null + var nmerged = 0 + for (yy <- year1 to year2) { + for (mm <- 1 to 12) { + print("\n%d/%02d" format (yy, mm)) + val ff = new File(outfname + "%04d/%02d/wcount.gz" format (yy, mm)) + if (rebuild || ! ff.exists) { + var ndone = 0 + for (id <- 1 to 31) { + var ielem = 372*yy + 31*mm + id + var idisk = ielem % 16 + val fname = (infname + "%04d/%02d/%02d/" format (idisk, yy, mm, id)) + val ff = new File(fname + "wcount.gz") + if (ff.exists) { + val bb = HMat.loadBMat(fname + "dict.gz") + val cc = HMat.loadIMat(fname + "wcount.gz") + dd(ndone % 6) = Dict(bb, cc, threshold) + ndone = ndone + 1 + print("-") + if (ndone % 6 == 0) { + md(ndone / 6 - 1) = Dict.union(dd:_*) + print("+") + } + } + } + if (ndone % 6 != 0) { + md(ndone / 6) = Dict.union(dd.slice(0, ndone % 6):_*) + print("+") + } + if (ndone > 0) { + val dx = Dict.union(md.slice(0, (ndone-1)/6+1):_*) + val (sv, iv) = sortdown2(dx.counts) + val dxx = Dict(dx.cstr(iv), sv) + HMat.saveBMat(outfname + "%04d/%02d/dict.gz" format (yy, mm), BMat(dxx.cstr)) + HMat.saveDMat(outfname + "%04d/%02d/wcount.gz" format (yy, mm), dxx.counts) + } +// println("") + } + val f2 = new File(outfname + "%04d/%02d/wcount.gz" format (yy, mm)) + if (f2.exists) { + val bb = HMat.loadBMat(outfname + "%04d/%02d/dict.gz" format (yy, mm)) + val cc = HMat.loadDMat(outfname + "%04d/%02d/wcount.gz" format (yy, mm)) + yd(nmerged % 5) = Dict(bb, cc, 4*threshold) + nmerged += 1 + print("*") + if (nmerged % 5 == 0) { + val dm = Dict.union(yd:_*) + if (nmerged == 5) { + dy = dm + } else { + dy = Dict.union(dy, dm) + } + } + } + } + } + if (nmerged % 5 != 0) { + val dm = Dict.union(yd.slice(0, nmerged % 5):_*) + dy = Dict.union(dy, dm) + } + println + val (sv, iv) = sortdown2(dy.counts) + val dyy = Dict(dy.cstr(iv), sv) + HMat.saveBMat(outfname + "dict.gz", BMat(dyy.cstr)) + HMat.saveDMat(outfname + "wcount.gz", dyy.counts) + dyy + } + + def getDict = { + val bd = 
loadBMat("/big/twitter/tokenized/alldict.gz") + val bc = loadDMat("/big/twitter/tokenized/allwcount.gz") + Dict(bd, bc) + } + + def getBiDict = { + val bd = loadIMat("/big/twitter/tokenized/allbdict.lz4") + val bc = loadDMat("/big/twitter/tokenized/allbcnts.lz4") + IDict(bd, bc) + } + + def getTriDict = { + val bd = loadIMat("/big/twitter/tokenized/alltdict.lz4") + val bc = loadDMat("/big/twitter/tokenized/alltcnts.lz4") + IDict(bd, bc) + } + + def junk:CSMat = { + csrow("", "", "", "", "", "", "", + "", "", "", "", "", "", "", + "", "", "", "", "", "" + + "", "", "", "", "", "", "", "", + "", "", "", "", + "", "", "", "", "", "", + "", "", "", "", "", "", "", "", "", "", + "", "", "", "", "", "", + "http", "https", "apos", "kml", "amp", "www", "quot", "id", "latitude", "longitude", "latlonbox", "geo", "json") + } + + def findEmoticons(n:Int, dd:Dict) = { + val smiles = csrow(":-)", ":)", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ":っ)") + val laughs = csrow(":-d", ":d", "8-d", "8d", "x-d", "xd", "x-x", "=-d", "=d", "=-3", "=3", "b^d") + val frowns = csrow(">:[", ":-(", ":(", "", ":-c", ":c", ":-<", "", ":っc", ":<", ":-[", ":[", ":{") + val angry = csrow(":-||", ":@", ">:(") + val crying = csrow(":'-(", ":'(", "qq") + val horror = csrow("d:<", "d:", "d8", "d;", "d=", "dx", "v.v", "d-':") + val surprise = csrow(">:o", ":-o", ":o", "°o°", "°o°", ":o", "o_o", "o_0", "o.o", "8-0") + val wink = csrow(";-)", ";)", "*-)", "*)", ";-]", ";]", ";d", ";^)", ":-,") + val all = List(smiles, laughs, frowns, angry, crying, horror, surprise, wink, junk) + val out = zeros(all.length, n) + for (i <- 0 until all.length) { + val mm = all(i) + var j = 0 + while (j < mm.length) { + val k = dd(mm(j)) + if (k >= 0 && k < n) out(i, k) = 1 + j += 1 + } + } + out + } + + def getGramDict(nuni0:Int=50, nbi0:Int=100, ntri0:Int=200, rebuild:Boolean=false):Dict = { + val nuni = nuni0 * 1000 + val nbi = nbi0 * 1000 + val ntri = ntri0 * 1000 + val fname = 
"/big/twitter/tokenized/dict_%d_%d_%d" format (nuni0, nbi0, ntri0) + if (!rebuild && (new File(fname + "_bmat.lz4").exists) && (new File(fname + "_dmat.lz4").exists)) { + val bm = loadBMat(fname + "_bmat.lz4") + val dm = loadDMat(fname + "_dmat.lz4") + Dict(bm, dm) + } else { + val ud = getDict + val bd = getBiDict + val td = getTriDict + val dd = IDict.gramDict(nuni, nbi, ntri, ud, bd, td) + saveBMat(fname + "_bmat.lz4", BMat(dd.cstr)) + saveDMat(fname + "_dmat.lz4", dd.counts) + dd + } + } + + def getEmoticonMap(nuni0:Int=50, nbi0:Int=100, ntri0:Int=200, rebuild:Boolean=false):FMat = { + val nuni = nuni0 * 1000 + val nbi = nbi0 * 1000 + val ntri = ntri0 * 1000 + val fname = "/big/twitter/tokenized/dict_%d_%d_%d" format (nuni0, nbi0, ntri0) + if (!rebuild && (new File(fname + "_emos.lz4").exists)) { + loadFMat(fname + "_emos.lz4") + } else { + val ud = getDict + val bdt = getBiDict.grams(0->nbi,?) + val tdt = getTriDict.grams(0->ntri,?) + val em = findEmoticons(1 + maxi(irow(nuni) \ maxi(bdt) \ maxi(tdt)).v, ud) + val bv = zeros(em.nrows, nbi) + val tv = zeros(em.nrows, ntri) + for (i <- 0 until em.nrows) { + bv(i, ?) = max(em(i, bdt(?, 0)), em(i, bdt(?, 1))) + tv(i, ?) = max(em(i, tdt(?, 0)), max(em(i, tdt(?, 1)), em(i, tdt(?, 2)))) + } + val emos = em(?, 0->nuni) \ bv(?, 0->nbi) \ tv(?, 0->ntri) + saveFMat(fname + "_emos.lz4", emos) + emos + } + } + + def logisticModelPar( + nstart0:Int = FilesDataSource.encodeDate(2012,3,1,0), + nend0:Int = FilesDataSource.encodeDate(2013,7,1,0), + nuni0:Int = 50, + nbi0:Int = 100, + ntri0:Int = 200 + ) = { + val ds = SFilesDataSource.twitterNgramBlend(nstart0, nend0) +// val ds = SFilesDataSource.twitterWords(nstart0, nend0) + ds.opts.addConstFeat = true + ds.opts.featType = 0 + val gd = getGramDict(nuni0, nbi0, ntri0) + val em = getEmoticonMap(nuni0, nbi0, ntri0) + val nfeats = gd.length + 1 + val mask = (sum(em) == 0f) \ 1 +// val targets = em(0->(em.nrows-1), ?) \ zeros(em.nrows-1,1) + val targets = em(0->1, ?) 
\ 0 + val ntargets = targets.nrows + val exptsv = col(0.5, 0.6, 0.7, 0.8, 0.9, 1.0) + val exptst = col(0.5, 0.6, 0.7, 0.8, 0.9, 1.0) +// val expts = col(0.5) + val avalues = col(0.1f, 1f, 10f) + val expts1 = ones(avalues.length*ntargets, 1) ⊗ exptsv ⊗ ones(exptst.length, 1) + val expts2 = ones(avalues.length*exptsv.length*ntargets, 1) ⊗ exptst + val alphas = ones(ntargets, 1) ⊗ avalues ⊗ ones(exptst.length*exptsv.length, 1) + val aopts = new ADAGradUpdater.Options + aopts.vecExponent = expts1 + aopts.timeExponent = expts2 + aopts.alpha = alphas + aopts.mask = mask + val gopts = new GLMmodel.Options + gopts.links = iones(expts1.length, 1) + gopts.mask = mask + gopts.targmap = mkdiag(ones(ntargets, 1)) ⊗ ones(expts1.length/ntargets, 1) + gopts.targets = targets + new LearnFParModelx(ds, gopts, GLMmodel.mkGLMmodel _, aopts, GLMmodel.mkUpdater _) + } + + def logisticModel( + mat:SMat, + ntargs:Int = 1, + exptsv:FMat = col(0.4, 0.5, 0.6), + exptst:FMat = col(0.4, 0.5, 0.6), + avalues:FMat = col(0.1, 0.3, 1), + nuni0:Int = 50, + nbi0:Int = 100, + ntri0:Int = 200 + ) = { + val ds = new MatDataSource(Array(mat:Mat)) + val gd = getGramDict(nuni0, nbi0, ntri0) + val em = getEmoticonMap(nuni0, nbi0, ntri0) + val nfeats = gd.length + 1 + val mask = (sum(em) == 0f) \ 1 + val targets0 = em(0->(em.nrows-1), ?) \ zeros(em.nrows-1,1) + val targets = targets0(0->ntargs, ?) 
+ val ntargets = targets.nrows + val expts1 = ones(avalues.length*ntargets, 1) ⊗ exptsv ⊗ ones(exptst.length, 1) + val expts2 = ones(avalues.length*exptsv.length*ntargets, 1) ⊗ exptst + val alphas = ones(ntargets, 1) ⊗ avalues ⊗ ones(exptst.length*exptsv.length, 1) + val aopts = new ADAGradUpdater.Options + aopts.vecExponent = expts1 + aopts.timeExponent = expts2 + aopts.alpha = alphas + aopts.mask = mask + val gopts = new GLMmodel.Options + gopts.links = iones(expts1.length, 1) + gopts.mask = mask + gopts.targmap = mkdiag(ones(ntargets, 1)) ⊗ ones(expts1.length/ntargets, 1) + gopts.targets = targets + Learner(ds, new GLMmodel(gopts), null, new ADAGradUpdater(aopts)) + } + + +} \ No newline at end of file diff --git a/src/main/scala/BIDMach/FactorModel.scala b/src/main/scala/BIDMach/FactorModel.scala new file mode 100755 index 00000000..f9c7df63 --- /dev/null +++ b/src/main/scala/BIDMach/FactorModel.scala @@ -0,0 +1,76 @@ +package BIDMach + +import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ + +abstract class FactorModel(override val opts:FactorModel.Options) extends Model(opts) { + + override def init(datasource:DataSource) = { + super.init(datasource) + val data0 = mats(0) + val m = size(data0, 1) + val d = opts.dim + val sdat = (sum(data0,2).t + 1.0f).asInstanceOf[FMat] + val sp = sdat / sum(sdat) + println("initial perplexity=%f" format math.exp(- (sp ddot ln(sp))) ) + + val modelmat = rand(d,m) + modelmat ~ modelmat *@ sdat + val msum = sum(modelmat, 2) + modelmat ~ modelmat / msum + modelmats = new Array[Mat](1) + modelmats(0) = if (opts.useGPU && Mat.hasCUDA > 0) GMat(modelmat) else modelmat + datasource.reset + + if (mats.size > 1) { + while (datasource.hasNext) { + mats = datasource.next + val dmat = mats(1) + dmat.set(1.0f/d) + datasource.putBack(mats,1) + } + } + } + + def reuseuser(a:Mat):Mat = { + val out = a match { + case aa:SMat => FMat.newOrCheckFMat(opts.dim, 
a.ncols, null, a.GUID, "reuseuser".##) + case aa:GSMat => GMat.newOrCheckGMat(opts.dim, a.ncols, null, a.GUID, "reuseuser".##) + } + out.set(1f) + out + } + + def uupdate(data:Mat, user:Mat) + + def mupdate(data:Mat, user:Mat) + + def mupdate2(data:Mat, user:Mat) = {} + + def evalfun(data:Mat, user:Mat):FMat + + def doblock(gmats:Array[Mat], i:Long) = { + val sdata = gmats(0) + val user = if (gmats.length > 1) gmats(1) else reuseuser(gmats(0)) + uupdate(sdata, user) + mupdate(sdata, user) + } + + def evalblock(mats:Array[Mat]):FMat = { + val sdata = gmats(0) + val user = if (gmats.length > 1) gmats(1) else reuseuser(gmats(0)) + uupdate(sdata, user) + evalfun(sdata, user) + } +} + +object FactorModel { + class Options extends Model.Options { + var uiter = 8 + var weps = 1e-10f + var minuser = 1e-8f + } +} + + diff --git a/src/main/scala/BIDMach/Featurizer.scala b/src/main/scala/BIDMach/Featurizer.scala new file mode 100755 index 00000000..f5946d76 --- /dev/null +++ b/src/main/scala/BIDMach/Featurizer.scala @@ -0,0 +1,627 @@ +package BIDMach +import BIDMat.{Mat,BMat,CMat,CSMat,Dict,DMat,FMat,GMat,GIMat,GSMat,HMat,IDict,IMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ +import scala.actors._ +import scala.annotation.switch +import java.io._ + +class Featurizer(val opts:Featurizer.Options = new Featurizer.Options) { + + var alldict:Dict = null + var allbdict:IDict = null + var alltdict:IDict = null + + def mergeDicts(rebuild:Int,dictname:String="dict.gz",wcountname:String="wcount.gz"):Dict = { + val dd = new Array[Dict](5) // Big enough to hold log2(days per month) + val nmonths = 2 + (opts.nend - opts.nstart)/31 + val md = new Array[Dict](1+(math.log(nmonths)/math.log(2)).toInt) // Big enough to hold log2(num months) + println("Building monthly dicts for "+opts.thisDir) + for (d <- opts.nstart to opts.nend) { // Conditional on rebuild, merge the dictionaries for each month + val (year, month, day) = Featurizer.decodeDate(d) + val fm = new 
File(opts.fromMonthDir(d) + wcountname) + if (rebuild > 1 || ! fm.exists) { + val fd = new File(opts.fromDayDir(d) + wcountname) + if (fd.exists) { + val bb = loadBMat(opts.fromDayDir(d) + dictname) + val cc = loadIMat(opts.fromDayDir(d) + wcountname) + Dict.treeAdd(Dict(bb, cc, opts.threshold), dd) + print(".") + } + if (day == 31) { + val dx = Dict.treeFlush(dd) + if (dx != null) { + val (sv, iv) = sortdown2(dx.counts) + val dxx = Dict(dx.cstr(iv), sv) + val fd = new File(opts.fromMonthDir(d)) + if (!fd.exists) fd.mkdirs + saveBMat(opts.fromMonthDir(d)+dictname, BMat(dxx.cstr)) + saveDMat(opts.fromMonthDir(d)+wcountname, dxx.counts) + println("%04d-%02d" format (year,month)) + } + } + } + } + if (rebuild > 0) { + println("Merging monthly dicts for "+opts.thisDir) + for (d <- opts.nstart to opts.nend) { // Conditionally merge all monthly dictionaries + val (year, month, day) = Featurizer.decodeDate(d) + if (day == 31) { + val fm = new File(opts.fromMonthDir(d) + wcountname) + if (fm.exists) { + val bb = loadBMat(opts.fromMonthDir(d) + dictname) + val cc = loadDMat(opts.fromMonthDir(d) + wcountname) + Dict.treeAdd(Dict(bb, cc, 4*opts.threshold), md) + println("%04d-%02d" format (year,month)) + } + } + } + println + val dy = Dict.treeFlush(md) // Get merged dictionary, sort by counts descending + val (sv, iv) = sortdown2(dy.counts) + val dyy = Dict(dy.cstr(iv), sv) + saveBMat(opts.thisDir + dictname, BMat(dyy.cstr)) + saveDMat(opts.thisDir + wcountname, dyy.counts) + dyy + } else { + Dict(loadBMat(opts.thisDir + dictname), loadDMat(opts.thisDir + wcountname)) + } + } + + def mergeIDicts(rebuild:Int = 0, dictname:String="bdict.lz4", wcountname:String="bcnts.lz4", mapit:Boolean=true):IDict = { + println("Building monthly IDicts for " + opts.thisDir + " " + dictname) + if (alldict == null) alldict = Dict(loadBMat(opts.mainDict)) + val dd = new Array[IDict](5) // Big enough to hold log2(days per month) + val nmonths = 2 + (opts.nend - opts.nstart)/31 + val md = new 
Array[IDict](1+(math.log(nmonths)/math.log(2)).toInt) // Big enough to hold log2(num months) + var dy:IDict = null + var mdict:Dict = null + var domonth:Boolean = false + var lastmonth = 0 + for (d <- opts.nstart to opts.nend) { + val (year, month, day) = Featurizer.decodeDate(d) + if (month != lastmonth) { + val dfname = opts.fromMonthDir(d) + opts.localDict + if (fileExists(dfname)) { + mdict = Dict(loadBMat(dfname)) // Load token dictionary for this month + val fm = new File(opts.fromMonthDir(d) + wcountname) // Did we process this month? + domonth = rebuild > 1 || !fm.exists + } else { + mdict = null + domonth = false + } + lastmonth = month + } + if (domonth) { + val fd = new File(opts.fromDayDir(d) + wcountname) + if (fd.exists) { + val bb = loadIMat(opts.fromDayDir(d) + dictname) // Load IDict info for this day + val cc = loadDMat(opts.fromDayDir(d) + wcountname) + +// Kludge to deal with (old) scanner problem + val ig = find(maxi(bb, 2) < 0x7fffffff) + val bb2 = bb(ig, ?) + val bm = if (mapit) { + val dict = Dict(loadBMat(opts.fromDayDir(d) + opts.localDict)) // Load token dictionary for this day + val map = dict --> mdict // Map from this days tokens to month dictionary + map(bb2) // Map the ngrams + } else { + bb2 + } + val cc2 = cc(ig,0) +// Done kludge + val igood = find(mini(bm, 2) >= 0) // Find the good ones + val bg = bm(igood,?) 
+ val cg = cc2(igood) + val ip = icol(0->igood.length) + sortlexInds(bg, ip) // lex sort them + IDict.treeAdd(IDict(bg, cg(ip), opts.threshold), dd) // accumulate them + print(".") + } + if (day == 31) { // On the last day, save the accumulated results + val dx = IDict.treeFlush(dd) + if (dx != null) { + saveIMat(opts.fromMonthDir(d)+dictname, dx.grams) + saveDMat(opts.fromMonthDir(d)+wcountname, dx.counts) + } + println("%04d-%02d" format (year,month)) + } + } + } + if (rebuild > 0) { + println("Merging monthly IDicts for " + opts.thisDir) + for (d <- opts.nstart to opts.nend) { + val (year, month, day) = Featurizer.decodeDate(d) + if (day == 31) { // Conditionally accumulate monthly dicts + val dfname = opts.fromMonthDir(d) + opts.localDict + if (fileExists(dfname) || ! mapit) { + mdict = if (mapit) Dict(loadBMat(dfname)) else null + val fm = new File(opts.fromMonthDir(d) + wcountname) + if (fm.exists) { + val bb = HMat.loadIMat(opts.fromMonthDir(d) + dictname) // Load the IDict data for this month + val cc = HMat.loadDMat(opts.fromMonthDir(d) + wcountname) + val bm = if (mapit) { + val map = mdict --> alldict + map(bb) // Map to global token dictionary + } else bb + val igood = find(mini(bm, 2) >= 0) // Save the good stuff + val bg = bm(igood,?) 
+ val cg = cc(igood) + val ip = icol(0->igood.length) + sortlexInds(bg, ip) + IDict.treeAdd(IDict(bg, cg(ip), 4*opts.threshold), md) + println("%04d-%02d" format (year,month)) + } + } + } + } + dy = IDict.treeFlush(md) // Final dictionary for the time period + println + val (sv, iv) = sortdown2(dy.counts) // Sort down by ngram frequency + val dyy = IDict(dy.grams(iv,?), sv) + saveIMat(opts.thisDir + dictname, dyy.grams) + saveDMat(opts.thisDir + wcountname, dyy.counts) + dy // Return the lex-sorted dictionary + } else { + val gyy = loadIMat(opts.thisDir + dictname) + val cyy = loadDMat(opts.thisDir + wcountname) + val iperm = icol(0->cyy.length) + sortlexInds(gyy, iperm) + IDict(gyy, cyy(iperm)) + } + } + + + def mkIDicts(rebuild:Int, scanner:Scanner=TwitterScanner) = { // Build ngram dictionaries for each day + val nthreads = math.min(opts.nthreads, math.max(1, Mat.hasCUDA)) + println("Building daily IDicts") + val done = izeros(nthreads,1) + for (ithread <- 0 until nthreads) { + Actor.actor { + if (Mat.hasCUDA > 0) setGPU(ithread+Mat.hasCUDA-nthreads) + val bigramsx = IMat(opts.guessSize, 3) // Temp storage for grams + val trigramsx = IMat(opts.guessSize, 4) + val useridsx = IMat(opts.guessSize/10, 2) + val bdicts = new Array[IDict](5) // Trees to hold partial merges + val tdicts = new Array[IDict](5) + val udicts = new Array[IDict](5) + + for (d <- (opts.nstart+ithread) to opts.nend by nthreads) { + val (year, month, day) = Featurizer.decodeDate(d) + val fname = opts.fromDayDir(d)+opts.localDict + val fnew = opts.fromDayDir(d)+opts.usrCnts // Check if the userid dictionary was built yet + if (fileExists(fname) && (rebuild > 1 || !fileExists(fnew))) { + val dict = Dict(loadBMat(fname)) // load token dictionary for this day + for (ifile <- 0 until 24) { + val fn = opts.fromDayDir(d)+opts.fromFile(ifile) + if (fileExists(fn)) { + val idata = loadIMat(fn) + val (nuni, nbi, ntri, nusers) = scanner.scan(opts, dict, idata, null, bigramsx, trigramsx, useridsx) + val 
bigrams = bigramsx(0->nbi, 0->2) + val bid = if (nbi > 0) IDict.dictFromData(bigrams) else null + val trigrams = trigramsx(0->ntri, 0->3) + val trid = if (ntri > 0) IDict.dictFromData(trigrams) else null + val userids = useridsx(0->nusers, 0) + val uid = if (nusers > 0) IDict.dictFromData(userids) else null + IDict.treeAdd(bid, bdicts) + IDict.treeAdd(trid, tdicts) + IDict.treeAdd(uid, udicts) + } + } + val bf = IDict.treeFlush(bdicts) + val tf = IDict.treeFlush(tdicts) + val uf = IDict.treeFlush(udicts) + saveIMat(opts.fromDayDir(d) + opts.biDict, bf.grams) + saveDMat(opts.fromDayDir(d) + opts.biCnts, bf.counts) + saveIMat(opts.fromDayDir(d) + opts.triDict, tf.grams) + saveDMat(opts.fromDayDir(d) + opts.triCnts, tf.counts) + saveIMat(opts.fromDayDir(d) + opts.usrDict, uf.grams) + saveDMat(opts.fromDayDir(d) + opts.usrCnts, uf.counts) + print(".") + } + if (ithread == 0 && day/nthreads == 31/nthreads) println("%04d-%02d" format (year,month)) + } + done(ithread,0) = 1 + } + } + while (mini(done).v == 0) Thread.`yield` + } + + def mkUniFeats(map:IMat, gramsx:IMat, ng:Int):IMat = { + val unis = map(gramsx(0->ng, 0)) + val igood = find(unis >= 0) + val gg = unis(igood, 0) + val ggn = gramsx(igood, 1) + val feats = ggn \ gg + sortlex(feats) + val (outr, ix, iy) = uniquerows(feats) + val fcounts = (ix(1->ix.length, 0) on iy.length) - ix + outr \ fcounts + } + + def mkGramFeats(map:IMat, gramsx:IMat, ng:Int, alldict:IDict):IMat = { + val grams = map(gramsx(0->ng, 0->(gramsx.ncols-1))) + val igood = find(mini(grams, 2) >= 0) + val gg = grams(igood,?) 
+ val ggn = gramsx(igood, gramsx.ncols-1) + val gmap = IDict(gg) --> alldict + val igood2 = find(gmap >= 0) + val feats = ggn(igood2,0) \ gmap(igood2,0) + sortlex(feats) + val (outr, ix, iy) = uniquerows(feats) + val fcounts = (ix(1->ix.length, 0) on iy.length) - ix + outr \ fcounts + } + + def featurize(rebuild:Int, scanner:Scanner=TwitterScanner) = { + println("Featurizing in " + opts.thisDir) + if (alldict == null) alldict = Dict(HMat.loadBMat(opts.mainDict)) + if (allbdict == null) allbdict = IDict(HMat.loadIMat(opts.mainBDict)) + if (alltdict == null) alltdict = IDict(HMat.loadIMat(opts.mainTDict)) + alldict.makeHash + allbdict.makeSorted + alltdict.makeSorted + val nthreads = math.min(opts.nthreads, math.max(1, Mat.hasCUDA)) + val done = izeros(nthreads,1) + for (ithread <- 0 until nthreads) { + Actor.actor { + if (Mat.hasCUDA > 0) setGPU(ithread+Mat.hasCUDA-nthreads) + val unigramsx = IMat(opts.guessSize, 2) + val bigramsx = IMat(opts.guessSize, 3) + val trigramsx = IMat(opts.guessSize, 4) + val userids = IMat(opts.guessSize/10, 2) + for (d <- (opts.nstart+ithread) to opts.nend by nthreads) { + val (year, month, day) = Featurizer.decodeDate(d) + val fdict = opts.fromDayDir(d)+opts.localDict + if (fileExists(fdict)) { + var dict:Dict = null + var map:IMat = null + val fd = new File(opts.toDayDir(d)) + if (!fd.exists) fd.mkdirs + for (ifile <- 0 until 24) { + val fn = opts.fromDayDir(d)+opts.fromFile(ifile) + val fx = opts.toDayDir(d)+opts.toTriFeats(ifile) + if (fileExists(fn) && (rebuild > 0 || !fileExists(fx))) { + if (dict == null) { + dict = Dict(loadBMat(fdict)) + map = dict --> alldict + } + val idata = loadIMat(fn) + val (nuni, nbi, ntri, nstatuses) = scanner.scan(opts, dict, idata, unigramsx, bigramsx, trigramsx, userids) + val unifeats = mkUniFeats(map, unigramsx, nuni) + val bifeats = mkGramFeats(map, bigramsx, nbi, allbdict) + val trifeats = mkGramFeats(map, trigramsx, ntri, alltdict) + saveIMat(opts.toDayDir(d) + opts.toUniFeats(ifile), unifeats) 
+ saveIMat(opts.toDayDir(d) + opts.toBiFeats(ifile), bifeats) + saveIMat(opts.toDayDir(d) + opts.toTriFeats(ifile), trifeats) + saveIMat(opts.toDayDir(d) + opts.toUserids(ifile), userids(0->nstatuses, ?)) + if (ifile == 23) print(".") + } + } + } + if (ithread == 0 && day/nthreads == 31/nthreads) println("%04d-%02d" format (year,month)) + } + done(ithread,0) = 1 + } + } + while (mini(done).v == 0) Thread.`yield` + } + + def fileExists(fname:String) = { + val testme = new File(fname) + testme.exists + } + + def loadDicts() = { + if (alldict == null) alldict = Dict(HMat.loadBMat(opts.mainDict)) + if (allbdict == null) allbdict = IDict(HMat.loadIMat(opts.mainBDict)) + if (alltdict == null) alltdict = IDict(HMat.loadIMat(opts.mainTDict)) + val alld = alldict.cstr + val bg = allbdict.grams + val tg = alltdict.grams + val bd = CSMat(bg.nrows,1) + val td = CSMat(tg.nrows,1) + var i = 0 + while (i < bg.nrows) { + bd(i) = alld(bg(i,0)) + " " + alld(bg(i,1)) + i += 1 + } + i = 0 + while (i < tg.nrows) { + td(i) = (alld(tg(i,0)) + " " + alld(tg(i,1))) + (" " + alld(tg(i,2))) + i += 1 + } + (alld, bd, td) + } +} + +object Featurizer { + + def alloptions = { + val ff = new Featurizer + val newopts = new Featurizer.Options{ + override val tokDirName = "twitter/smiley/tokenized/" + override val featDirName = "twitter/smiley/featurized/" + } + val fs = new Featurizer(newopts) + (ff,fs) + } + + /* + * Rebuild levels: + * 0: Incrementally build monthly Dicts and Idicts and featurize any new files. Dont rebuild dictionaries + * 1: Rebuild all dictionaries from monthlies, and rebuild all features. 
+ * 2: Rebuild everything + */ + + def updateDicts(rebuild:Int=0) = { + val (ff,fs) = alloptions + ff.mergeDicts(rebuild) + fs.mergeDicts(rebuild) + ff.mkIDicts(rebuild) + fs.mkIDicts(rebuild) + } + + def buildAll(rebuild:Int=0) = { + buildMainDict(rebuild) + buildMainGDicts(rebuild) + buildFeatures(rebuild) + } + + def buildMainDict(rebuild:Int) = { + val (ff,fs) = alloptions + val d1 = ff.mergeDicts(rebuild) + val d2 = fs.mergeDicts(rebuild) + if (rebuild>0) { + val dd = Dict.union(d1, d2) + val (sc, ic) = sortdown2(dd.counts) + saveBMat(ff.opts.mainDict, BMat(dd.cstr(ic,0))) + saveDMat(ff.opts.mainCounts, sc) + } + } + + def buildMainGDicts(rebuild:Int) = { + val (ff, fs) = alloptions + + val bd1 = ff.mergeIDicts(rebuild) + val bd2 = fs.mergeIDicts(rebuild) + if (rebuild>0) { + val bdd = IDict.merge2(bd1,bd2) + val (sbc, ibc) = sortdown2(bdd.counts) + saveIMat(ff.opts.mainBDict, IMat(bdd.grams(ibc,?))) + saveDMat(ff.opts.mainBCounts, sbc) + } + + val td1 = ff.mergeIDicts(rebuild, "tdict.lz4", "tcnts.lz4") + val td2 = fs.mergeIDicts(rebuild, "tdict.lz4", "tcnts.lz4") + if (rebuild>0) { + val tdd = IDict.merge2(td1,td2) + val (stc, itc) = sortdown2(tdd.counts) + saveIMat(ff.opts.mainTDict, IMat(tdd.grams(itc,?))) + saveDMat(ff.opts.mainTCounts, stc) + } + + ff.opts.threshold = 1 + fs.opts.threshold = 1 + val usr1 = ff.mergeIDicts(rebuild, "usrdict.lz4", "usrcnts.lz4", false) + val usr2 = fs.mergeIDicts(rebuild, "usrdict.lz4", "usrcnts.lz4", false) + if (rebuild>0) { + val usr = IDict.merge2(usr1,usr2) + val (usrs, usrc) = sortdown2(usr.counts) + saveIMat(ff.opts.mainUsrDict, IMat(usr.grams(usrc,?))) + saveDMat(ff.opts.mainUsrCounts, usrs) + } + } + + def buildFeatures(rebuild:Int) = { + val (ff, fs) = alloptions + fs.featurize(rebuild) + ff.featurize(rebuild) + } + + def encodeDate(yy:Int, mm:Int, dd:Int) = (372*yy + 31*mm + dd) + + def decodeDate(n:Int):(Int, Int, Int) = { + val yy = (n - 32) / 372 + val days = n - 32 - 372 * yy + val mm = days / 31 + 1 + val dd 
= days - 31 * (mm - 1) + 1 + (yy, mm, dd) + } + + def dirxMap(fname:String):(Int)=>String = { + (n:Int) => { + val (yy, mm, dd) = decodeDate(n) + (fname format (n % 16, yy, mm, dd)) + } + } + + def dirMap(fname:String):(Int)=>String = { + (n:Int) => { + val (yy, mm, dd) = decodeDate(n) + (fname format (yy, mm, dd)) + } + } + + + class Options { + val tokDirName = "twitter/tokenized/" + val featDirName = "twitter/featurized/" + val localDict:String = "dict.gz" + val localCount:String = "wcount.gz" + val biDict:String = "bdict.lz4" + val triDict:String = "tdict.lz4" + val usrDict:String = "usrdict.lz4" + val biCnts:String = "bcnts.lz4" + val triCnts:String = "tcnts.lz4" + val usrCnts:String = "usrcnts.lz4" + def thisDir = "/big/" + tokDirName + def mainDir = "/big/twitter/tokenized/" + def mainDict:String = mainDir + "all" + localDict + def mainCounts:String = mainDir + "all" + localCount + def mainBDict:String = mainDir + "all" + biDict + def mainBCounts:String = mainDir + "all" + biCnts + def mainTDict:String = mainDir + "all" + triDict + def mainTCounts:String = mainDir + "all" + triCnts + def mainUsrDict:String = mainDir + "all" + usrDict + def mainUsrCounts:String = mainDir + "all" + usrCnts + def fromYearDir:(Int)=>String = dirMap(thisDir + "%04d/") + def fromMonthDir:(Int)=>String = dirMap(thisDir + "%04d/%02d/") + def fromDayDir:(Int)=>String = dirxMap("/disk%02d/" + tokDirName + "%04d/%02d/%02d/") + def toDayDir:(Int)=>String = dirxMap("/disk%02d/" + featDirName + "%04d/%02d/%02d/") + var fromFile:(Int)=>String = (n:Int) => ("tweet%02d.gz" format n) + var toUniFeats:(Int)=>String = (n:Int) => ("unifeats%02d.lz4" format n) + var toBiFeats:(Int)=>String = (n:Int) => ("bifeats%02d.lz4" format n) + var toTriFeats:(Int)=>String = (n:Int) => ("trifeats%02d.lz4" format n) + var toUserids:(Int)=>String = (n:Int) => ("userids%02d.lz4" format n) + var nstart:Int = encodeDate(2011,11,22) + var nend:Int = encodeDate(2013,6,31) + var threshold = 10 + var guessSize = 
200000000 + var nthreads = 1 + } +} + +trait Scanner { + def scan(opts:Featurizer.Options, dict:Dict, idata:IMat, unigramsx:IMat, bigramsx:IMat, trigramsx:IMat, userids:IMat):(Int, Int, Int, Int) +} + +object TwitterScanner extends Scanner { + final val OutsideStatus = 0 + final val InsideStatus = 1 + final val InsideUser = 2 + final val InsideUserId = 3 + final val InsideText = 4 + final val InsideRetweet = 5 + final val InsideStatusL2 = 6 + final val InsideUserL2 = 7 + final val InsideUserIdL2 = 8 + final val InsideTextL2 = 9 + + def scan(opts:Featurizer.Options, dict:Dict, idata:IMat, unigramsx:IMat, bigramsx:IMat, trigramsx:IMat, userids:IMat):(Int, Int, Int, Int) = { + + val Isstart = dict("") + val Isend = dict("") + val Irstart = dict("") + val Irend = dict("") + val Itstart = dict("") + val Itend = dict("") + val Iuser = dict("") + val Iuend = dict("") + val Iistart = dict("") + val Iiend = dict("") + var state = 0 + + var istatus = -1 + var nuni = 0 + var nbi = 0 + var ntri = 0 + var len = idata.length + var i = 0 + while (i < len) { + val tok = idata.data(i)-1 +// if (tok+1 >0) println(dict(tok)+ " " + state) +// else println("num " +(-(tok+1))+ " " + state) + if (tok == Isend) { + state = OutsideStatus + } else { + (state: @switch) match { + case OutsideStatus => + if (tok == Isstart) { + state = InsideStatus + istatus += 1 + } + case InsideStatus => + tok match { + case Iuser => state = InsideUser + case Itstart => state = InsideText + case Irstart => state = InsideRetweet + case _ => {} + } + case InsideUser => + tok match { + case Iistart => state = InsideUserId + case Irstart => state = InsideRetweet + case Iuend => state = InsideStatus + case _ => {} + } + case InsideUserId => + if (tok == Iiend) { + state = InsideUser + } else if (tok+1 < 0) { + if (userids != null) { + userids(istatus,0) = -(tok+1) + userids(istatus,1) = 0 + } + } + case InsideText => + tok match { + case Iuser => state = InsideUser + case Itend => state = InsideStatus + case _ => 
if (tok+1 > 0) {
+              if (unigramsx != null) {
+                unigramsx(nuni, 0) = tok
+                unigramsx(nuni, 1) = istatus
+                nuni += 1
+              }
+              if (idata.data(i-1) > 0) {
+                val tok1 = idata.data(i-1)-1
+                if (tok1 != Itstart) {
+                  bigramsx(nbi, 0) = tok1
+                  bigramsx(nbi, 1) = tok
+                  bigramsx(nbi, 2) = istatus
+                  nbi += 1
+                  if (idata.data(i-2) > 0) {
+                    val tok2 = idata.data(i-2)-1
+                    if (tok2 != Itstart) {
+                      trigramsx(ntri, 0) = tok2
+                      trigramsx(ntri, 1) = tok1
+                      trigramsx(ntri, 2) = tok
+                      trigramsx(ntri, 3) = istatus
+                      ntri += 1
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        case InsideRetweet => // retweet payload is parsed by the *L2 states below
+          tok match {
+            case Isstart => state = InsideStatusL2
+            case Irend => state = InsideStatus
+            case _ => {}
+          }
+        case InsideStatusL2 =>
+          tok match {
+            case Iuser => state = InsideUserL2
+            case Itstart => state = InsideTextL2
+            case _ => {}
+          }
+        case InsideUserL2 =>
+          tok match {
+            case Iistart => state = InsideUserIdL2
+            case Iuend => state = InsideStatusL2
+            case _ => {}
+          }
+        case InsideUserIdL2 =>
+          tok match {
+            case Iiend => state = InsideUserL2
+            case _ => if (tok+1 < 0) { // fix: was (tok-1 < 0); numeric ids are stored negated, so raw < 0 <=> tok+1 < 0, matching InsideUserId above
+              if (userids != null) userids(istatus, 1) = -(tok+1)
+            }
+          }
+        case InsideTextL2 =>
+          tok match {
+            case Itend => state = InsideStatusL2
+            case Iuser => state = InsideUserL2
+            case _ => {}
+          }
+        case _ => {}
+        }
+
+      }
+      i += 1
+    }
+    (nuni, nbi, ntri, istatus)
+  }
+}
\ No newline at end of file
diff --git a/src/main/scala/BIDMach/GLMmodel.scala b/src/main/scala/BIDMach/GLMmodel.scala
new file mode 100755
index 00000000..65a3a63c
--- /dev/null
+++ b/src/main/scala/BIDMach/GLMmodel.scala
@@ -0,0 +1,195 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import edu.berkeley.bid.CUMAT
+
+
+class GLMmodel(opts:GLMmodel.Options) extends RegressionModel(opts) { // generalized linear model; per-target link chosen via opts.links
+
+  var mylinks:Mat = null
+
+  val linkArray = Array[GLMlink](LinearLink, LogisticLink)
+
+  var totflops = 0L
+
+  override def init(datasource:DataSource) = {
+    super.init(datasource)
+    
mylinks = if (useGPU) GIMat(opts.links) else opts.links
+    modelmats(0) ~ modelmats(0) ∘ mask
+    totflops = 0L
+    for (i <- 0 until opts.links.length) {
+      totflops += linkArray(opts.links(i)).fnflops
+    }
+  }
+
+  def mupdate(in:Mat):FMat = {
+//  println("model %f" format (mean(mean(modelmats(0)))).dv)
+    val targs = targets * in
+    min(targs, 1f, targs)
+    val alltargs = targmap * targs
+    val eta = modelmats(0) * in
+    applymeans(eta, mylinks, eta)
+//  println("pred %f" format (mean(mean(pred))).dv)
+//  println("%s %s %s %s %s" format (modelmats(0).mytype, updatemats(0).mytype, alltargs.mytype, pred.mytype, in.mytype))
+    val lls = llfun(eta, alltargs, mylinks)
+    alltargs ~ alltargs - eta
+    updatemats(0) ~ alltargs *^ in
+    lls
+  }
+
+  def applymeans(eta:Mat, links:Mat, out:Mat):Mat = { // apply each row's inverse link to eta, writing into out; returns out
+    (eta, links, out) match {
+      case (feta:FMat, ilinks:IMat, fout:FMat) => {
+        Mat.nflops += totflops * feta.ncols
+        var i = 0
+        val out = fout // fix: was (feta + 3f) - an unused temp that shadowed the out parameter and was returned instead of the filled result
+        while (i < feta.ncols) {
+          var j = 0
+          while (j < feta.nrows) {
+            val fun = linkArray(ilinks(j)).invlinkfn
+            fout.data(j + i * out.nrows) = fun(feta.data(j + i * feta.nrows))
+            j += 1
+          }
+          i += 1
+        }
+        out
+      }
+      case (geta:GMat, gilinks:GIMat, gout:GMat) => {
+        Mat.nflops += totflops * geta.ncols
+        CUMAT.applymeans(geta.data, gilinks.data, gout.data, geta.nrows, geta.ncols)
+        out
+      }
+    }
+  }
+
+  def llfun(pred:Mat, targ:Mat, links:Mat):FMat = { // per-column mean log likelihood under each row's link
+    (pred, targ, links) match {
+      case (fpred:FMat, ftarg:FMat, ilinks:IMat) => {
+        Mat.nflops += 10L * ftarg.length
+        var i = 0
+        val out = (ftarg + 5f) // scratch with ftarg's shape; every entry is overwritten below
+        while (i < ftarg.ncols) {
+          var j = 0
+          while (j < ftarg.nrows) {
+            val fun = linkArray(ilinks(j)).likelihoodfn
+            out.data(j + i * out.nrows) = fun(fpred.data(j + i * ftarg.nrows), ftarg.data(j + i * ftarg.nrows))
+            j += 1
+          }
+          i += 1
+        }
+        mean(out,2)
+      }
+      case (gpred:GMat, gtarg:GMat, gilinks:GIMat) => {
+        Mat.nflops += totflops * gpred.ncols
+        val out = (gpred + 3f)
+        CUMAT.applylls(gpred.data, gtarg.data, gilinks.data, out.data, gpred.nrows, 
gpred.ncols) + FMat(mean(out,2)) + } + } + } + +} + + +object LinearLink extends GLMlink { + def link(in:Float) = { + in + } + + def invlink(in:Float) = { + in + } + + def dlink(in:Float) = { + 1.0f + } + + def likelihood(pred:Float, targ:Float) = { + val diff = targ - pred + - diff * diff + } + + override val linkfn = link _ + + override val dlinkfn = dlink _ + + override val invlinkfn = invlink _ + + override val likelihoodfn = likelihood _ + + val fnflops = 2 +} + +object LogisticLink extends GLMlink { + def link(in:Float) = { + math.log(in / (1.0f - in)).toFloat + } + + def invlink(in:Float) = { + if (in > 0) { + val tmp = math.exp(-in) + (1.0 / (1.0 + tmp)).toFloat + } else { + val tmp = math.exp(in) + (tmp / (1.0 + tmp)).toFloat + } + } + + def dlink(in:Float) = { + 1 / (in * (1 - in)) + } + + def likelihood(pred:Float, targ:Float) = { + math.log(targ * pred + (1.0f - targ) * (1.0f - pred) + 1e-20).toFloat + } + + override val linkfn = link _ + + override val dlinkfn = dlink _ + + override val invlinkfn = invlink _ + + override val likelihoodfn = likelihood _ + + val fnflops = 20 +} + +object LinkEnum extends Enumeration { + type LinkEnum = Value + val Linear, Logistic = Value +} + +abstract class GLMlink { + val linkfn:(Float => Float) + val dlinkfn:(Float => Float) + val invlinkfn:(Float => Float) + val likelihoodfn:((Float,Float) => Float) + val fnflops:Int +} + +object GLMmodel { + class Options extends RegressionModel.Options { + var links:IMat = null + } + + def mkGLMmodel(fopts:Model.Options) = { + new GLMmodel(fopts.asInstanceOf[GLMmodel.Options]) + } + + def mkUpdater(nopts:Updater.Options) = { + new ADAGradUpdater(nopts.asInstanceOf[ADAGradUpdater.Options]) + } + + def learnFParx( + nstart:Int=FilesDataSource.encodeDate(2012,3,1,0), + nend:Int=FilesDataSource.encodeDate(2012,12,1,0) + ) = { + new LearnFParModelx( + SFilesDataSource.twitterNgramBlend(nstart, nend, 1, 0), + new GLMmodel.Options, mkGLMmodel _, + new ADAGradUpdater.Options, mkUpdater _) 
+ } +} + diff --git a/src/main/scala/BIDMach/LDAModel.scala b/src/main/scala/BIDMach/LDAModel.scala new file mode 100755 index 00000000..8b445f67 --- /dev/null +++ b/src/main/scala/BIDMach/LDAModel.scala @@ -0,0 +1,118 @@ +package BIDMach + +import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ + + +class LDAModel(override val opts:LDAModel.Options = new LDAModel.Options) extends FactorModel(opts) { + var mm:Mat = null + var alpha:Mat = null + + var traceMem = false + + override def init(datasource:DataSource) = { + super.init(datasource) + mm = modelmats(0) + modelmats = new Array[Mat](2) + modelmats(0) = mm + modelmats(1) = mm.ones(mm.nrows, 1) + updatemats = new Array[Mat](2) + updatemats(0) = mm.zeros(mm.nrows, mm.ncols) + updatemats(1) = mm.zeros(mm.nrows, 1) + } + + def uupdate(sdata:Mat, user:Mat):Unit = { + if (opts.putBack < 0) user.set(1f) + for (i <- 0 until opts.uiter) { + val preds = DDS(mm, user, sdata) + if (traceMem) println("uupdate %d %d %d, %d %f %d" format (mm.GUID, user.GUID, sdata.GUID, preds.GUID, GPUmem._1, getGPU)) + val dc = sdata.contents + val pc = preds.contents + max(opts.weps, pc, pc) + pc ~ dc / pc + val unew = user ∘ (mm * preds) + opts.alpha + if (traceMem) println("uupdate %d %d %d, %d %d %d %d %f %d" format (mm.GUID, user.GUID, sdata.GUID, preds.GUID, dc.GUID, pc.GUID, unew.GUID, GPUmem._1, getGPU)) + if (opts.exppsi) exppsi(unew, unew) + user <-- unew + } +// println("user %g %g" format (mini(mini(user,1),2).dv, maxi(maxi(user,1),2).dv)) + } + + def mupdate(sdata:Mat, user:Mat):Unit = { + val preds = DDS(mm, user, sdata) + val dc = sdata.contents + val pc = preds.contents + max(opts.weps, pc, pc) + pc ~ dc / pc + val ud = user *^ preds + ud ~ ud ∘ mm + ud ~ ud + opts.beta + updatemats(0) <-- ud + sum(ud, 2, updatemats(1)) + if (traceMem) println("mupdate %d %d %d %d" format (sdata.GUID, user.GUID, ud.GUID, updatemats(0).GUID)) + } + + def 
evalfun(sdata:Mat, user:Mat):FMat = { + val preds = DDS(mm, user, sdata) + val dc = sdata.contents + val pc = preds.contents + max(opts.weps, pc, pc) + ln(pc, pc) + val sdat = sum(sdata,1) + val mms = sum(mm,2) + val suu = ln(mms ^* user) + if (traceMem) println("evalfun %d %d %d, %d %d %d, %d %f" format (sdata.GUID, user.GUID, preds.GUID, pc.GUID, sdat.GUID, mms.GUID, suu.GUID, GPUmem._1)) + val vv = ((pc ddot dc) - (sdat ddot suu))/sum(sdat,2).dv + row(vv, math.exp(-vv)) + } +} + +object LDAModel { + class Options extends FactorModel.Options { + var LDAeps = 1e-9 + var exppsi = true + var alpha = 0.001f + var beta = 0.0001f + } + + def mkLDAmodel(fopts:Model.Options) = { + new LDAModel(fopts.asInstanceOf[LDAModel.Options]) + } + + def mkUpdater(nopts:Updater.Options) = { + new IncNormUpdater(nopts.asInstanceOf[IncNormUpdater.Options]) + } + + def learn(mat0:Mat) = { + new Learner(new MatDataSource(Array(mat0:Mat)), new LDAModel(), null, new IncNormUpdater(), new Learner.Options) + } + + def learnBatch(mat0:Mat) = { + new Learner(new MatDataSource(Array(mat0:Mat)), new LDAModel(), null, new BatchNormUpdater(), new Learner.Options) + } + + def learnFPar( + nstart:Int=FilesDataSource.encodeDate(2012,3,1,0), + nend:Int=FilesDataSource.encodeDate(2012,12,1,0) + ) = { + new LearnFParModel( + new LDAModel.Options, mkLDAmodel _, + new IncNormUpdater.Options, mkUpdater _, + (n:Int, i:Int) => SFilesDataSource.twitterWords(nstart, nend, n, i) + ) + } + + def learnFParx( + nstart:Int=FilesDataSource.encodeDate(2012,3,1,0), + nend:Int=FilesDataSource.encodeDate(2012,12,1,0) + ) = { + new LearnFParModelx( + SFilesDataSource.twitterWords(nstart, nend), + new LDAModel.Options, mkLDAmodel _, + new IncNormUpdater.Options, mkUpdater _ + ) + } +} + + diff --git a/src/main/scala/BIDMach/Learner.scala b/src/main/scala/BIDMach/Learner.scala new file mode 100755 index 00000000..2eb39f17 --- /dev/null +++ b/src/main/scala/BIDMach/Learner.scala @@ -0,0 +1,509 @@ +package BIDMach +import 
BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat} +import BIDMat.MatFunctions._ +import BIDMat.SciFunctions._ +import BIDMat.Plotting._ +import scala.collection.immutable.List +import scala.collection.mutable.ListBuffer +import scala.actors.Actor + +case class Learner( + val datasource:DataSource, + val model:Model, + val regularizer:Regularizer, + val updater:Updater, + val opts:Learner.Options = new Learner.Options) { + var results:FMat = null + val dopts:DataSource.Options = datasource.opts + val mopts:Model.Options = model.opts + val ropts:Regularizer.Options = if (regularizer != null) regularizer.opts else null + val uopts:Updater.Options = updater.opts + + def setup = { + datasource match { + case ddm:MatDataSource => { + if (mopts.putBack >= 0) { + ddm.setupPutBack(mopts.putBack+1, mopts.dim) + } + } + case _ => {} + } + datasource.init + model.init(datasource) + updater.init(model) + } + + def init = { + datasource.init + model.init(datasource) + updater.init(model) + } + + def run() = { + flip + var done = false + var ipass = 0 + var here = 0L + var lasti = 0 + var bytes = 0L + updater.clear + val reslist = new ListBuffer[FMat] + val samplist = new ListBuffer[Float] + while (ipass < opts.npasses && ! done) { + var lastp = 0f + datasource.reset + var istep = 0 + println("i=%2d" format ipass) + while (datasource.hasNext) { + val mats = datasource.next + here += datasource.opts.blockSize + bytes += 12L*mats(0).nnz + if ((istep - 1) % opts.evalStep == 0 || ! 
datasource.hasNext) { + val scores = model.evalblockg(mats) + reslist.append(scores.newcopy) + samplist.append(here) + } else { + model.doblockg(mats, here) + if (regularizer != null) regularizer.compute(here) + updater.update(here) + } + if (model.opts.putBack >= 0) datasource.putBack(mats, model.opts.putBack) + istep += 1 + val dsp = datasource.progress + if (dsp > lastp + opts.pstep && reslist.length > lasti) { + val gf = gflop + lastp = dsp - (dsp % opts.pstep) + print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format ( + 100f*lastp, + Learner.scoreSummary(reslist, lasti, reslist.length), + gf._1, + gf._2, + bytes*1e-9, + bytes/gf._2*1e-6)) + if (model.useGPU) { + print(", GPUmem=%3.2f" format GPUmem._1) + } + println + lasti = reslist.length + } + } + updater.updateM + ipass += 1 + } + val gf = gflop + println("Time=%5.4f secs, gflops=%4.2f" format (gf._2, gf._1)) + results = Learner.scores2FMat(reslist) on row(samplist.toList) + } +} + +case class ParLearner( + val datasources:Array[DataSource], + val models:Array[Model], + val regularizers:Array[Regularizer], + val updaters:Array[Updater], + val opts:Learner.Options = new Learner.Options) { + + var um:FMat = null + var mm:FMat = null + var results:FMat = null + + def run() = { + flip + val mm0 = models(0).modelmats(0) + mm = zeros(mm0.nrows, mm0.ncols) + um = zeros(mm0.nrows, mm0.ncols) + + @volatile var done = izeros(opts.nthreads, 1) + var ipass = 0 + var istep0 = 0L + var ilast0 = 0L + var bytes = 0L + val reslist = new ListBuffer[FMat] + val samplist = new ListBuffer[Float] + var lastp = 0f + done.clear + for (ithread <- 0 until opts.nthreads) { + Actor.actor { + if (ithread < Mat.hasCUDA) setGPU(ithread) + var here = 0L + updaters(ithread).clear + while (ipass < opts.npasses) { + if (ithread == 0) println("i=%2d" format ipass) + datasources(ithread).reset + var istep = 0 + var lasti = 0 + while (datasources(ithread).hasNext) { + val mats = datasources(ithread).next + here += 
datasources(ithread).opts.blockSize + for (j <- 0 until mats.length) bytes += 12L * mats(j).nnz + istep += 1 + istep0 += 1 + try { + if (istep % opts.evalStep == 0) { + val scores = models(ithread).synchronized {models(ithread).evalblockg(mats)} + reslist.append(scores) + samplist.append(here) + } else { + models(ithread).synchronized { + models(ithread).doblockg(mats, here) + if (regularizers != null && regularizers(ithread) != null) regularizers(ithread).compute(here) + updaters(ithread).update(here) + } + } + } catch { + case e:Exception => { + print("Caught exception in thread %d %s\nTrying restart..." format (ithread, e.toString)) + restart(ithread) + println("Keep on truckin...") + } + } + Thread.sleep(opts.coolit) + if (models(ithread).opts.putBack >= 0) datasources(ithread).putBack(mats, models(ithread).opts.putBack) +// if (istep % (opts.syncStep/opts.nthreads) == 0) syncmodel(models, ithread) + if (ithread == 0 && datasources(0).progress > lastp + opts.pstep) { + lastp += opts.pstep + val gf = gflop + if (reslist.length > lasti) { + print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format ( + 100f*lastp, + Learner.scoreSummary(reslist, lasti, reslist.length), + gf._1, + gf._2, + bytes*1e-9, + bytes/gf._2*1e-6)) + if (models(0).useGPU) { + for (i <- 0 until math.min(opts.nthreads, Mat.hasCUDA)) { + setGPU(i) + if (i==0) print(", GPUmem=%3.2f" format GPUmem._1) else print(", %3.2f" format GPUmem._1) + } + } + println + } + lasti = reslist.length + } + } + models(ithread).synchronized {updaters(ithread).updateM} + done(ithread) = ipass + 1 + while (done(ithread) > ipass) Thread.sleep(1) + } + } + } + while (ipass < opts.npasses) { + while (mini(done).v == ipass) { + while (istep0 < ilast0 + opts.syncStep) Thread.sleep(1) + syncmodels(models) + ilast0 += opts.syncStep + } + ipass += 1 + } + val gf = gflop + println("Time=%5.4f secs, gflops=%4.2f, MB/s=%5.2f, GB=%5.2f" format (gf._2, gf._1, bytes/gf._2*1e-6, bytes*1e-9)) + results = 
Learner.scores2FMat(reslist) on row(samplist.toList) + } + + def syncmodels(models:Array[Model]) = { + for (j <- 0 until models(0).modelmats.length) { + mm.clear + for (i <- 0 until models.length) { + if (i < Mat.hasCUDA) setGPU(i) + models(i).synchronized { + um <-- models(i).modelmats(j) + } + mm ~ mm + um + } + mm ~ mm *@ (1f/models.length) + for (i <- 0 until models.length) { + if (i < Mat.hasCUDA) setGPU(i) + models(i).synchronized { + models(i).modelmats(j) <-- mm + } + } + } + if (0 < Mat.hasCUDA) setGPU(0) + } + + def syncmodel(models:Array[Model], ithread:Int) = { + mm.synchronized { + um <-- models(ithread).modelmats(0) + um ~ um *@ (1f/opts.nthreads) + mm ~ mm *@ (1 - 1f/opts.nthreads) + mm ~ mm + um + models(ithread).modelmats(0) <-- mm + } + } + + def restart(ithread:Int) = { + if (models(0).useGPU) { + resetGPU + Mat.trimCache2(ithread) + } + models(ithread).init(datasources(ithread)) + models(ithread).modelmats(0) <-- mm + updaters(ithread).init(models(ithread)) + } +} + +case class ParLearnerx( + val datasource:DataSource, + val models:Array[Model], + val regularizers:Array[Regularizer], + val updaters:Array[Updater], + val opts:Learner.Options = new Learner.Options) { + + var um:FMat = null + var mm:FMat = null + var results:FMat = null + var cmats:Array[Array[Mat]] = null + + def run() = { + flip + val mm0 = models(0).modelmats(0) + mm = zeros(mm0.nrows, mm0.ncols) + um = zeros(mm0.nrows, mm0.ncols) + cmats = new Array[Array[Mat]](opts.nthreads) + for (i <- 0 until opts.nthreads) cmats(i) = new Array[Mat](datasource.omats.length) + + val done = iones(opts.nthreads, 1) + var ipass = 0 + var here = 0L + var feats = 0L + var lasti = 0 + var bytes = 0L + val reslist = new ListBuffer[FMat] + val samplist = new ListBuffer[Float] + for (i <- 0 until opts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i) + updaters(i).clear + } + while (ipass < opts.npasses) { + datasource.reset + var istep = 0 + var lastp = 0f + println("i=%2d" format ipass) + while 
(datasource.hasNext) { + for (ithread <- 0 until opts.nthreads) { + if (datasource.hasNext) { + done(ithread) = 0 + val mats = datasource.next + here += datasource.opts.blockSize + feats += mats(0).nnz + bytes += 12L*mats(0).nnz + for (j <- 0 until mats.length) cmats(ithread)(j) = safeCopy(mats(j), ithread) + Actor.actor { + if (ithread < Mat.hasCUDA) setGPU(ithread) + try { + if ((istep + ithread + 1) % opts.evalStep == 0 || !datasource.hasNext ) { + val scores = models(ithread).evalblockg(cmats(ithread)) + reslist.append(scores(0)) + samplist.append(here) + } else { + models(ithread).doblockg(cmats(ithread), here) + if (regularizers != null && regularizers(ithread) != null) regularizers(ithread).compute(here) + updaters(ithread).update(here) + } + } catch { + case e:Exception => { + print("Caught exception in thread %d %s\nTrying restart..." format (ithread, e.toString)) + restart(ithread) + println("Keep on truckin...") + } + } + done(ithread) = 1 + } + } + } + while (mini(done).v == 0) Thread.sleep(1) + Thread.sleep(opts.coolit) + istep += opts.nthreads + if (istep % opts.syncStep == 0) syncmodels(models) + if (datasource.progress > lastp + opts.pstep) { + lastp += opts.pstep + val gf = gflop + if (reslist.length > lasti) { + print("%5.2f%%, %s, gf=%5.3f, secs=%3.1f, GB=%4.2f, MB/s=%5.2f" format ( + 100f*lastp, + Learner.scoreSummary(reslist, lasti, reslist.length), + gf._1, + gf._2, + bytes*1e-9, + bytes/gf._2*1e-6)) + if (models(0).useGPU) { + for (i <- 0 until math.min(opts.nthreads, Mat.hasCUDA)) { + setGPU(i) + if (i==0) print(", GPUmem=%3.2f" format GPUmem._1) else print(", %3.2f" format GPUmem._1) + } + } + println + } + lasti = reslist.length + } + } + println + for (i <- 0 until opts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i); + updaters(i).updateM + } + ipass += 1 + saveAs("/big/twitter/test/results.mat", Learner.scores2FMat(reslist) on row(samplist.toList), "results") + } + val gf = gflop + println("Time=%5.4f secs, gflops=%4.2f, samples=%4.2g, 
MB/sec=%4.2g" format (gf._2, gf._1, 1.0*here, bytes/gf._2/1e6)) + results = Learner.scores2FMat(reslist) on row(samplist.toList) + if (0 < Mat.hasCUDA) setGPU(0) + } + + def safeCopy(m:Mat, ithread:Int):Mat = { + m match { + case ss:SMat => { + val out = SMat.newOrCheckSMat(ss.nrows, ss.ncols, ss.nnz, null, m.GUID, ithread, "safeCopy".##) + ss.copyTo(out) + } + } + } + + def syncmodels(models:Array[Model]) = { + for (j <- 0 until models(0).modelmats.length) { + mm.clear + for (i <- 0 until models.length) { + if (i < Mat.hasCUDA) setGPU(i) + um <-- models(i).modelmats(j) + mm ~ mm + um + } + mm ~ mm *@ (1f/models.length) + for (i <- 0 until models.length) { + if (i < Mat.hasCUDA) setGPU(i) + models(i).modelmats(j) <-- mm + } + } + if (0 < Mat.hasCUDA) setGPU(0) + } + + def restart(ithread:Int) = { + if (models(0).useGPU) { + resetGPU + Mat.trimCaches(ithread) + } + models(ithread).init(datasource) + models(ithread).modelmats(0) <-- mm + updaters(ithread).init(models(ithread)) + } +} + + +class LearnFParModel( + val mopts:Model.Options, + mkmodel:(Model.Options)=>Model, + val uopts:Updater.Options, + mkupdater:(Updater.Options)=>Updater, + ddfun:(Int,Int)=>DataSource + ) { + var dds:Array[DataSource] = null + var models:Array[Model] = null + var updaters:Array[Updater] = null + var learner:ParLearner = null + var lopts = new Learner.Options + + def setup = { + dds = new Array[DataSource](lopts.nthreads) + models = new Array[Model](lopts.nthreads) + updaters = new Array[Updater](lopts.nthreads) + for (i <- 0 until lopts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i) + dds(i) = ddfun(lopts.nthreads, i) + dds(i).init + models(i) = mkmodel(mopts) + models(i).init(dds(i)) + updaters(i) = mkupdater(uopts) + updaters(i).init(models(i)) + } + if (0 < Mat.hasCUDA) setGPU(0) + learner = new ParLearner(dds, models, null, updaters, lopts) + } + + def init = { + for (i <- 0 until lopts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i) + if (dds(i).omats.length > 1) dds(i).omats(1) = 
ones(mopts.dim, dds(i).omats(0).ncols) + dds(i).init + models(i).init(dds(i)) + updaters(i).init(models(i)) + } + if (0 < Mat.hasCUDA) setGPU(0) + } + + def run = learner.run +} + + +class LearnFParModelx( + val ds:DataSource, + val mopts:Model.Options, + mkmodel:(Model.Options)=>Model, + val uopts:Updater.Options, + mkupdater:(Updater.Options)=>Updater) { + var models:Array[Model] = null + var updaters:Array[Updater] = null + var learner:ParLearnerx = null + var lopts = new Learner.Options + + def setup = { + models = new Array[Model](lopts.nthreads) + updaters = new Array[Updater](lopts.nthreads) + ds.init + for (i <- 0 until lopts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i) + models(i) = mkmodel(mopts) + models(i).init(ds) + updaters(i) = mkupdater(uopts) + updaters(i).init(models(i)) + } + if (0 < Mat.hasCUDA) setGPU(0) + learner = new ParLearnerx(ds, models, null, updaters, lopts) + } + + def init = { + ds.omats(1) = ones(mopts.dim, ds.omats(0).ncols) + for (i <- 0 until lopts.nthreads) { + if (i < Mat.hasCUDA) setGPU(i) + if (ds.omats.length > 1) + ds.init + models(i).init(ds) + updaters(i).init(models(i)) + } + if (0 < Mat.hasCUDA) setGPU(0) + } + def run = learner.run +} + +object Learner { + + class Options { + var npasses = 10 + var evalStep = 15 + var syncStep = 32 + var nthreads = 4 + var pstep = 0.01f + var coolit = 60 + } + + def scoreSummary(reslist:ListBuffer[FMat], lasti:Int, length:Int):String = { + var i = lasti + var sum = 0.0 + while (i < length) { + sum += reslist(i)(0) + i += 1 + } + ("ll=%5.3f" format sum/(length-lasti)) + } + + def scores2FMat(reslist:ListBuffer[FMat]):FMat = { + val out = FMat(reslist(0).length, reslist.length) + var i = 0 + while (i < reslist.length) { + out(?, i) = reslist(i) + i += 1 + } + out + } +} + diff --git a/src/main/scala/BIDMach/Model.scala b/src/main/scala/BIDMach/Model.scala new file mode 100755 index 00000000..8e166fa9 --- /dev/null +++ b/src/main/scala/BIDMach/Model.scala @@ -0,0 +1,69 @@ +package BIDMach 
+import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,GMat,GIMat,GSMat,HMat,IMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Model(val opts:Model.Options = new Model.Options) { // base class for learners: holds model/update matrices and CPU<->GPU staging
+
+  var modelmats:Array[Mat] = null // the model parameters, one Mat per component
+
+  var updatemats:Array[Mat] = null // updates produced by doblock, consumed by an Updater
+
+  var mats:Array[Mat] = null // sample minibatch drawn from the datasource in init (CPU side)
+
+  var gmats:Array[Mat] = null // GPU copies of mats when useGPU, otherwise aliases mats
+
+  var useGPU = false // set in init from opts.useGPU && Mat.hasCUDA > 0
+
+  def init(datasource:DataSource):Unit = { // peek one block to size GPU buffers, then reset the source
+    mats = datasource.next
+    datasource.reset
+    useGPU = opts.useGPU && Mat.hasCUDA > 0
+    if (useGPU) {
+      gmats = new Array[Mat](mats.length)
+      for (i <- 0 until mats.length) {
+        gmats(i) = mats(i) match { // NOTE(review): non-exhaustive - any Mat type other than FMat/SMat throws MatchError here
+          case aa:FMat => GMat(aa)
+          case aa:SMat => GSMat(aa)
+        }
+      }
+    } else {
+      gmats = mats
+    }
+  }
+
+  def doblock(mats:Array[Mat], i:Long) // Calculate an update for the updater
+
+  def evalblock(mats:Array[Mat]):FMat // Scores (log likelihoods)
+
+  def doblockg(amats:Array[Mat], i:Long) = { // stage amats to GPU (if used), run doblock, copy the putBack mat out again
+    if (useGPU) copyMats(amats, gmats)
+    doblock(gmats, i)
+    if (useGPU && opts.putBack >= 0) amats(opts.putBack) <-- gmats(opts.putBack)
+  }
+
+  def evalblockg(amats:Array[Mat]):FMat = { // same staging as doblockg, but returns the score row
+    if (useGPU) copyMats(amats, gmats)
+    val v = evalblock(gmats)
+    if (useGPU && opts.putBack >= 0) amats(opts.putBack) <-- gmats(opts.putBack)
+    v
+  }
+
+  def copyMats(from:Array[Mat], to:Array[Mat]) = { // <-- copies element-wise; reassignment keeps any matrix newly allocated by <--
+    for (i <- 0 until from.length) {
+      to(i) = to(i) <-- from(i)
+    }
+}
+}
+
+
+object Model {
+  class Options { // defaults shared by all models
+    var nzPerColumn:Int = 0
+    var startBlock = 8000
+    var useGPU = true
+    var putBack = -1 // index of the mat copied back to the datasource after each block; -1 = none
+    var doubleScore = false
+    var dim = 256 // model dimension (e.g. number of topics/factors)
+  }
+}
diff --git a/src/main/scala/BIDMach/NMFModel.scala b/src/main/scala/BIDMach/NMFModel.scala
new file mode 100755
index 00000000..115bfe7a
--- /dev/null
+++ b/src/main/scala/BIDMach/NMFModel.scala
@@ -0,0 +1,138 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
+class NMFModel(opts:NMFModel.Options = new NMFModel.Options) extends 
FactorModel(opts) {
+
+  var mm:Mat = null
+  var mdiag:Mat = null
+  var udiag:Mat = null
+
+  override def init(datasource:DataSource) = {
+    super.init(datasource)
+    mm = modelmats(0)
+    modelmats = new Array[Mat](2)
+    modelmats(0) = mm
+    modelmats(1) = mm.zeros(mm.nrows, mm.ncols)
+    updatemats = new Array[Mat](2)
+    updatemats(0) = mm.zeros(mm.nrows, mm.ncols)
+    updatemats(1) = mm.zeros(mm.nrows, mm.ncols)
+    udiag = mkdiag(opts.uprior*ones(opts.dim,1))
+    mdiag = mkdiag(opts.mprior*ones(opts.dim,1))
+    if (useGPU) {
+      udiag = GMat(udiag)
+      mdiag = GMat(mdiag)
+    }
+  }
+
+  override def uupdate(sdata:Mat, user:Mat) = {
+    val modeldata = mm * sdata
+    val mmu = mm *^ mm + udiag
+    for (i <- 0 until opts.uiter) {
+      val quot = modeldata / (mmu * user)
+      min(10.0f, max(0.1f, quot, quot), quot) // clamp the multiplicative factor to [0.1, 10]
+      user ~ user *@ quot
+      max(opts.minuser, user, user)
+    }
+  }
+
+  override def mupdate(sdata:Mat, user:Mat):Unit = {
+    val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+    updatemats(0) ~ (user *^ sdata) *@ mm
+    updatemats(1) ~ uu * mm
+    max(updatemats(1), opts.NMFeps, updatemats(1)) // floor the denominator to avoid divide-by-zero
+  }
+
+  override def mupdate2(sdata:Mat, user:Mat):Unit = {
+    val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+    updatemats(0) ~ user *^ sdata
+    updatemats(1) ~ uu * mm
+  }
+
+  override def evalfun(sdata:Mat, user:Mat):FMat = {
+    if (opts.doubleScore) {
+      evalfunx(sdata, user)
+    } else {
+      val modeldata = mm * sdata
+      val uu = user *^ user + mdiag *@ (1.0f*size(user,2)/opts.nusers)
+      val mmm = mm *^ mm
+
+      val ll0 = sdata.contents ddot sdata.contents
+      val ll1 = modeldata ddot user
+      val ll2 = uu ddot mmm
+      val v1 = (-ll0 + 2*ll1 - ll2)/sdata.nnz
+      val v2 = -opts.uprior*(user ddot user)/sdata.nnz
+      row(v1,v2)
+    }
+  }
+
+  def evalfunx(sdata0:Mat, user0:Mat):FMat = { // double-precision variant of evalfun
+    val sdata = SDMat(sdata0)
+    val user = DMat(user0)
+    val mmf = DMat(mm)
+    val mdiagf = DMat(mdiag)
+
+    val modeldata = mmf * sdata
+    val uu = user *^ user + mdiagf *@ (1.0f*size(user,2)/opts.nusers)
+    val mmm = mmf *^ mmf
+
+    val ll0 = sdata.contents ddot sdata.contents
+    val ll1 = modeldata ddot user
+    val ll2 = uu ddot mmm
+    val v1 = (-ll0 + 2*ll1 - ll2)/sdata.nnz
+    val v2 = -opts.uprior*(user ddot user)/sdata.nnz
+    row(v1,v2)
+  }
+}
+
+object NMFModel {
+  class Options extends FactorModel.Options {
+    var NMFeps = 1e-12
+    var uprior = 0.01f
+    var mprior = 1e-4f
+    var nusers = 100000
+  }
+
+  def mkNMFmodel(fopts:Model.Options) = {
+    new NMFModel(fopts.asInstanceOf[NMFModel.Options])
+  }
+
+  def mkUpdater(nopts:Updater.Options) = {
+    new IncNormUpdater(nopts.asInstanceOf[IncNormUpdater.Options])
+  }
+
+  def learn(mat0:Mat) = {
+    new Learner(new MatDataSource(Array(mat0:Mat)), new NMFModel(), null, new IncNormUpdater(), new Learner.Options)
+  }
+
+  def learnBatch(mat0:Mat) = {
+    new Learner(new MatDataSource(Array(mat0:Mat)), new NMFModel(), null, new BatchNormUpdater(), new Learner.Options)
+  }
+
+  def learnFPar(
+    nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+    nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+  ) = {
+    new LearnFParModel(
+      new NMFModel.Options, mkNMFmodel _,
+      new IncNormUpdater.Options, mkUpdater _,
+      (n:Int, i:Int) => SFilesDataSource.twitterWords(nstart, nend, n, i)
+    )
+  }
+
+  def learnFParx(
+    nstart:Int=FilesDataSource.encodeDate(2012,3,1,0),
+    nend:Int=FilesDataSource.encodeDate(2012,12,1,0)
+  ) = {
+    new LearnFParModelx(
+      SFilesDataSource.twitterWords(nstart, nend),
+      new NMFModel.Options, mkNMFmodel _,
+      new IncNormUpdater.Options, mkUpdater _)
+  }
+}
+
+
+
diff --git a/src/main/scala/BIDMach/Regression.scala b/src/main/scala/BIDMach/Regression.scala
new file mode 100755
index 00000000..b74dab19
--- /dev/null
+++ b/src/main/scala/BIDMach/Regression.scala
@@ -0,0 +1,55 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class RegressionModel(override val opts:RegressionModel.Options) extends Model {
+  var targmap:Mat = null
+  var targets:Mat = null
+  var mask:Mat = null
+
+  override def init(datasource:DataSource) = {
+    super.init(datasource)
+    useGPU = opts.useGPU && Mat.hasCUDA > 0
+    val data0 = mats(0)
+    val m = size(data0, 1)
+    val d = opts.targmap.nrows
+    val sdat = (sum(data0,2).t + 0.5f).asInstanceOf[FMat]
+    val sp = sdat / sum(sdat)
+    println("initial perplexity=%f" format (sp ddot ln(sp)) )
+
+    val rmat = rand(d,m)
+    rmat ~ rmat *@ sdat
+    val msum = sum(rmat, 2)
+    rmat ~ rmat / msum
+    val mm = rmat
+    modelmats = new Array[Mat](1) // fix: was `Array[Mat](1)` (Array.apply on the Int 1 via implicit conversion); allocate a one-slot array as done for updatemats below
+    modelmats(0) = if (useGPU) GMat(mm) else mm
+    updatemats = new Array[Mat](1)
+    updatemats(0) = modelmats(0).zeros(mm.nrows, mm.ncols)
+    targets = if (useGPU) GMat(opts.targets) else opts.targets
+    targmap = if (useGPU) GMat(opts.targmap) else opts.targmap
+    mask = if (useGPU) GMat(opts.mask) else opts.mask
+  }
+
+  def mupdate(data:Mat):FMat
+
+  def doblock(gmats:Array[Mat], i:Long) = {
+    val sdata = gmats(0)
+    mupdate(sdata)
+  }
+
+  def evalblock(mats:Array[Mat]):FMat = {
+    val sdata = gmats(0) // NOTE(review): reads the gmats field, not the mats parameter (which is unused) — confirm intended
+    mupdate(sdata)
+  }
+}
+
+object RegressionModel {
+  class Options extends Model.Options {
+    var targets:FMat = null
+    var targmap:FMat = null
+    var mask:FMat = null
+  }
+}
diff --git a/src/main/scala/BIDMach/Regularizer.scala b/src/main/scala/BIDMach/Regularizer.scala
new file mode 100755
index 00000000..105b24d3
--- /dev/null
+++ b/src/main/scala/BIDMach/Regularizer.scala
@@ -0,0 +1,38 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Regularizer(val opts:Regularizer.Options = new Regularizer.Options) {
+  val options = opts
+  var modelmats:Array[Mat] = null
+  var updatemats:Array[Mat] = null
+
+  def compute(step:Float)
+
+  def init(model:Model) = {
+    modelmats = model.modelmats
+    updatemats = model.updatemats
+  }
+}
+
+class L1Regularizer(override val opts:Regularizer.Options = new Regularizer.Options) extends Regularizer(opts) {
+  def compute(step:Float) = { // adds -step*mprior*sign(w) to each gradient
+    for (i <- 0 until modelmats.length) {
+      updatemats(i) ~ updatemats(i) + (sign(modelmats(i)) * (-step*options.mprior))
+    }
+  }
+}
+
+class L2Regularizer(override val opts:Regularizer.Options = new Regularizer.Options) extends Regularizer(opts) {
+  def compute(step:Float) = { // adds -step*mprior*w to each gradient
+    for (i <- 0 until modelmats.length) {
+      updatemats(i) ~ updatemats(i) + (modelmats(i) * (-options.mprior * step))
+    }
+  }
+}
+
+object Regularizer {
+  class Options {
+    var mprior:FMat = 1e-7f }
+}
diff --git a/src/main/scala/BIDMach/Sampler.scala b/src/main/scala/BIDMach/Sampler.scala
new file mode 100755
index 00000000..7cb9d3b2
--- /dev/null
+++ b/src/main/scala/BIDMach/Sampler.scala
@@ -0,0 +1,22 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,CSMat,DMat,FMat,GMat,GIMat,GSMat,HMat,IMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+abstract class Sampler {
+
+  val options:Sampler.Options
+
+  def insample(pos:Int, modelnum:Int):Int = 1
+
+  def outsample(mat:Mat):Unit = {}
+
+
+}
+
+
+object Sampler {
+  class Options {
+
+  }
+}
diff --git a/src/main/scala/BIDMach/TestLearner.scala b/src/main/scala/BIDMach/TestLearner.scala
new file mode 100755
index 00000000..e081906b
--- /dev/null
+++ b/src/main/scala/BIDMach/TestLearner.scala
@@ -0,0 +1,153 @@
+package BIDMach
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Plotting._
+
+
+object TestLearner {
+/*
+  def runLDALearner(rt:SMat, rtest:SMat, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+
+// Mat.numThreads = 1
+    val model = new LDAmodel()
+    model.options.dim = ndims
+    model.options.uiter = 4
+    model.options.uprior = 1e-1f
+    model.options.mprior = 1e0f
+    model.options.minuser = 1e-7f
+    model.options.nzPerColumn = 400
+    model.options.useGPU = useGPU
+
+    val updater = new MultUpdater
+    updater.options.alpha = 0.3f
+//    val updater = new MultUpdater(model)
+//    updater.options.alpha = 0.1f
+    updater.options.initnsteps = 8000f
+
+    val learner = Learner(rt, null, rtest, null, model, null, updater)
+    learner.options.npasses = 20
+    learner.options.secprint = 100
+    learner.options.blocksize = 8000 //size(rt,2)
+    learner.options.numGPUthreads = nthreads
+    learner.run
+    learner
+  }
+
+  def runNMFLearner(rt:SMat, rtest:SMat, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+    val model = new NMFmodel()
+    model.options.dim = ndims
+    model.options.uiter = 4
+    model.options.uprior = 1e-4f
+    model.options.mprior = 1e2f
+    model.options.minuser = 1e-8f
+    model.options.nzPerColumn = 400
+    model.options.useGPU = useGPU
+
+    val updater = new MultUpdater
+    updater.options.alpha = 0.1f
+//    val updater = new MultUpdater(model)
+//    updater.options.alpha = 0.1f
+    updater.options.initnsteps = 8000f
+
+    val learner = Learner(rt, null, rtest, null, model, null, updater)
+    learner.options.npasses = 10
+    learner.options.secprint = 100
+    learner.options.blocksize = 16000/nthreads //size(rt,2)//40000 //
+    learner.options.numGPUthreads = nthreads
+    learner.run
+    learner
+  }
+
+  def runLogLearner(rt:SMat, st:FMat, rtest:SMat, stest:FMat):Learner = {
+    val model = new LogisticModel()
+    model.options.useGPU = false
+
+    val regularizer = new L1Regularizer(model)
+    regularizer.options.mprior = 1e-7f
+
+    val updater = new ADAGradUpdater
+    updater.options.alpha = 300f
+    updater.options.gradwindow = 1e6f
+
+    val learner = Learner(rt, st > 4, rtest, stest > 4, model, regularizer, updater)
+    learner.options.npasses = 20
+    learner.run
+    learner
+  }
+
+  def runLinLearner(rt:SMat, st:FMat, rtest:SMat, stest:FMat):Learner = {
+    val model = new LinearRegModel() {
+      override def regfn(targ:Mat, pred:Mat, lls:Mat, gradw:Mat):Unit = linearMap1(targ, pred, lls, gradw)
+    }
+    model.options.nzPerColumn = 400
+    model.options.transpose = false
+    model.options.useGPU = false
+
+    val regularizer = new L1Regularizer(model)
+    regularizer.options.mprior = 1e-6f
+
+    val updater = new ADAGradUpdater { override def update(step:Int):Unit = update1(step) }
+    // regularizer.options.beta = 1e-7f
+    updater.options.alpha = 200f
+    updater.options.gradwindow = 1e6f
+
+    val learner = Learner(rt, st, rtest, stest, model, regularizer, updater)
+    learner.options.npasses = 10
+    learner.options.secprint = 100
+    learner.run
+    learner
+  }
+
+  def runtest(dirname:String, ntest:Int, ndims:Int, nthreads:Int, useGPU:Boolean):Learner = {
+    tic
+    val revtrain:SMat = load(dirname+"xpart1.mat", "revtrain")
+    val revtest:SMat = load(dirname+"xpart1.mat", "revtest")
+    val t1 = toc; tic
+    val rt = revtrain(0->4000,0->(8000*(size(revtrain,2)/8000)))
+    val rtest = revtest(0->4000,0->(8000*(size(revtest,2)/8000)))
+    val scrtrain:IMat = load(dirname+"xpart1.mat", "scrtrain")
+    val scrtest:IMat = load(dirname+"xpart1.mat", "scrtest")
+    val st = FMat(scrtrain).t
+    val stest = (FMat(scrtest).t)(?,0->(8000*(size(revtest,2)/8000)))
+    val t2 = toc
+    println("Reading time=%3.2f+%3.2f seconds" format (t1,t2))
+    val ntargs = ndims
+    val stt = zeros(ntargs, size(st,2))
+    val sttest = zeros(ntargs, size(stest,2))
+    for (i<-0 until size(stt,1)) {stt(i,?) = st; sttest(i,?) = stest}
+    flip
+    val learner:Learner = ntest match {
+      case 1 => runLinLearner(rt, stt, rtest, sttest)
+      case 2 => runLogLearner(rt, stt, rtest, sttest)
+      case 3 => runNMFLearner(rt , rtest, ndims, nthreads, useGPU)
+      case 4 => runLDALearner(rt , rtest, ndims, nthreads, useGPU)
+    }
+    val (ff, tt) = gflop
+    println("Time=%5.3f, gflops=%3.2f" format (tt, ff))
+    val xvals = irow(1->(learner.tscores.size+1))
+    val tscores = learner.tscores
+    val tscorex = learner.tscorex
+    val tsteps = learner.tsteps
+    val timeplot = semilogy(xvals, drow(tscores), xvals, drow(tscorex))
+    val stepplot = semilogy(drow(tsteps), drow(learner.tscores), drow(tsteps), drow(tscorex))
+//    val userhist = hist(log10(FMat(targetmat)(?)),100)
+    timeplot.setTitle("Neg. log likelihood vs time in seconds")
+    stepplot.setTitle("Neg. log likelihood vs number of samples")
+    val modelhist = hist(log10(FMat(learner.model.modelmat)(?)),100)
+//    val userhist = hist(log10(FMat(learner.targetmat)(?)),100)
+    learner
+  }
+
+
+  def main(args: Array[String]): Unit = {
+    val dirname = args(0)
+    val ntest = args(1).toInt
+    val ndims = args(2).toInt
+    val nthreads = args(3).toInt
+    val useGPU = args(4).toBoolean
+
+    Mat.checkCUDA
+    runtest(dirname, ntest, ndims, nthreads, useGPU)
+  } */
+}
diff --git a/src/main/scala/BIDMach/Updater.scala b/src/main/scala/BIDMach/Updater.scala
new file mode 100755
index 00000000..1e4632ae
--- /dev/null
+++ b/src/main/scala/BIDMach/Updater.scala
@@ -0,0 +1,391 @@
+package BIDMach
+
+import BIDMat.{Mat,BMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+
+
+abstract class Updater(val opts:Updater.Options = new Updater.Options) {
+  var model:Model = null
+
+  def init(model0:Model) = {
+    model = model0
+  }
+
+  def update(step:Long):Unit
+  def updateM():Unit = {}
+  def clear():Unit = {}
+}
+
+
+class IncNormUpdater(override val opts:IncNormUpdater.Options = new IncNormUpdater.Options) extends Updater(opts) {
+
+  var firstStep = 0f
+  var rm:Mat = null
+  var restart:Mat = null
+  var started:Int = 0
+
+  override def init(model0:Model) = {
+    super.init(model0)
+    val modelmats = model0.modelmats
+    val updatemats = model0.updatemats
+    restart = modelmats(0) + 1f
+    rm = model0.modelmats(0).zeros(1,1)
+    firstStep = 0f
+  }
+
+  def update(step:Long) = {
+    val modelmats = model.modelmats
+    val updatemats = model.updatemats
+    val mm = modelmats(0)
+    val um = updatemats(0)
+    val rr = if (step == 0) 0.99f else {
+      if (firstStep == 0f) {
+        firstStep = step
+        0.99f
+      } else {
+        math.pow(firstStep / step, opts.power).toFloat
+      }
+    }
+    if (modelmats.length > 1) {
+      val ms = modelmats(1)
+      val ums = updatemats(1)
+//      println("ums0 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+      ums ~ ums *@ rm.set(rr)
+//      println("ums1 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+      ms ~ ms *@ rm.set(1-rr)
+//      println("ums2 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+      ms ~ ms + ums
+//      println("ums3 %g %g %g" format (rr, mini(mini(ums,1),2).dv, maxi(maxi(ums,1),2).dv))
+      um ~ um / ms
+//      println("um %g %g" format (mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+    }
+    um ~ um *@ rm.set(rr)
+    mm ~ mm *@ rm.set(1-rr)
+    mm ~ mm + um
+    mm ~ mm / sum(mm,2)
+    if (opts.warmup > 0) {
+      if (started == 0 && step > opts.warmup) {
+        restart <-- mm
+        started = 1
+      }
+      if (started == 1 && step > 2*opts.warmup) {
+        mm ~ mm - restart
+        max(mm, 0f, mm)
+        mm ~ mm / sum(mm,2)
+        started = 2
+      }
+    }
+  }
+
+  override def clear() = {
+    firstStep = 0f
+  }
+}
+
+class BatchNormUpdater(override val opts:BatchNormUpdater.Options = new BatchNormUpdater.Options) extends Updater {
+  var accumulators:Array[Mat] = null
+
+  override def init(model0:Model) = {
+    super.init(model0)
+    val modelmats = model.modelmats
+    val updatemats = model.updatemats
+    accumulators = new Array[Mat](updatemats.length)
+    for (i <- 0 until accumulators.length) {
+      accumulators(i) = updatemats(i).zeros(updatemats(i).nrows, updatemats(i).ncols)
+    }
+  }
+
+  def update(step:Long) = {
+    val updatemats = model.updatemats
+    for (i <- 0 until accumulators.length) {
+      accumulators(i) ~ accumulators(i) + updatemats(i)
+    }
+  }
+
+  override def clear() = {
+    for (i <- 0 until accumulators.length) {
+      accumulators(i).clear
+    }
+  }
+
+  override def updateM():Unit = {
+    val mm = model.modelmats(0)
+    mm ~ accumulators(0) / accumulators(1)
+    mm ~ mm / sum(mm,2)
+    clear
+  }
+}
+
+
+class IncMultUpdater(override val opts:IncMultUpdater.Options = new IncMultUpdater.Options) extends Updater {
+
+  var firstStep = 0f
+  var rm:Mat = null
+
+  override def init(model0:Model) = {
+    super.init(model0)
+    rm = model0.modelmats(0).zeros(1,1)
+  }
+
+  def update(step:Long) = {
+    val modelmats = model.modelmats
+    val updatemats = model.updatemats
+    val mm = modelmats(0)
+    val ms = modelmats(1)
+    val um = updatemats(0)
+    val ums = updatemats(1)
+    val rr = if (step == 0) 1f else {
+      if (firstStep == 0f) {
+        firstStep = step
+        1f
+      } else {
+        math.pow(firstStep / step, opts.power).toFloat
+      }
+    }
+//    println("rr=%g, %g %g" format (rr, mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+    um ~ um *@ rm.set(rr)
+//    println("rr=%g, %g %g" format (rr, mini(mini(um,1),2).dv, maxi(maxi(um,1),2).dv))
+    ln(mm, mm)
+//    println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+    mm ~ mm *@ rm.set(1-rr)
+//    println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+    mm ~ mm + um
+//    println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+    exp(mm, mm)
+//    println("mm=%g %g" format (mini(mini(mm,1),2).dv, maxi(maxi(mm,1),2).dv))
+    mm ~ mm / sum(mm,2)
+  }
+
+  override def clear() = {
+    firstStep = 0f
+  }
+}
+
+class TelescopingUpdater(override val opts:TelescopingUpdater.Options = new TelescopingUpdater.Options) extends Updater {
+  var accumulators:Array[Mat] = null
+  var firstStep = 0L
+  var nextStep = 10L
+  var nextCount = 0L
+  var rm:Mat = null
+
+  override def init(model0:Model) = {
+    super.init(model0)
+    val modelmats = model0.modelmats
+    val updatemats = model0.updatemats
+    rm = model0.modelmats(0).zeros(1,1)
+    accumulators = new Array[Mat](updatemats.length)
+    for (i <- 0 until updatemats.length) {
+      accumulators(i) = updatemats(i).zeros(updatemats(i).nrows, updatemats(i).ncols)
+    }
+    firstStep = 0L
+    nextStep = 10L
+    nextCount = 0L
+  }
+
+  def update(step:Long) = {
+    if (firstStep == 0 && step > 0) {
+      firstStep = step
+    }
+    val updatemats = model.updatemats
+    for (i <- 0 until updatemats.length) {
+      accumulators(i) ~ accumulators(i) + updatemats(i)
+    }
+    if (step >= nextCount) {
+      model.modelmats(0) ~ accumulators(0) / accumulators(1)
+      nextStep = (nextStep * opts.factor).toLong
+      nextCount = step + nextStep
+    }
+  }
+
+  override def clear() = {
+    for (i <- 0 until accumulators.length) {
+      accumulators(i).clear
+    }
+  }
+}
+
+
+class GradUpdater(override val opts:GradUpdater.Options = new GradUpdater.Options) extends Updater {
+
+  var firstStep = 0f
+  var modelmat:Mat = null
+  var updatemat:Mat = null
+  var sumSq:Mat = null // NOTE(review): sumSq and ve are never used in GradUpdater — confirm whether intended for subclasses
+  var stepn:Mat = null
+  var mask:Mat = null
+  var ve:Mat = null
+  var te:Mat = null
+  var alpha:Mat = null
+
+  override def init(model0:Model) = {
+    model = model0
+    modelmat = model.modelmats(0)
+    updatemat = model.updatemats(0)
+    mask = opts.mask
+    stepn = modelmat.zeros(1,1)
+    te = modelmat.zeros(opts.timeExponent.nrows, opts.timeExponent.ncols)
+    alpha = modelmat.zeros(opts.alpha.nrows, opts.alpha.ncols)
+    te <-- opts.timeExponent
+    alpha <-- opts.alpha
+  }
+
+  def update(step:Long):Unit = {
+    val nsteps = if (step == 0) 1f else {
+      if (firstStep == 0f) {
+        firstStep = step
+        1f
+      } else {
+        step / firstStep
+      }
+    }
+    stepn.set(1f/nsteps)
+    if (opts.waitsteps < nsteps) {
+      val tmp = updatemat *@ (alpha *@ (stepn ^ te))
+      modelmat ~ modelmat + tmp
+      if (mask != null) modelmat ~ modelmat *@ mask
+    }
+  }
+}
+
+
+class ADAGradUpdater(override val opts:ADAGradUpdater.Options = new ADAGradUpdater.Options) extends Updater {
+
+  var firstStep = 0f
+  var modelmat:Mat = null
+  var updatemat:Mat = null
+  var sumSq:Mat = null
+  var stepn:Mat = null
+  var mask:Mat = null
+  var ve:Mat = null
+  var te:Mat = null
+  var alpha:Mat = null
+
+  override def init(model0:Model) = {
+    model = model0
+    modelmat = model.modelmats(0)
+    updatemat = model.updatemats(0)
+    mask = opts.mask
+    if (sumSq.asInstanceOf[AnyRef] == null) {
+      sumSq = modelmat.ones(size(modelmat,1), size(modelmat,2)) *@ opts.initsumsq
+    } else {
+      sumSq.set(opts.initsumsq)
+    }
+    stepn = modelmat.zeros(1,1)
+    ve = modelmat.zeros(opts.vecExponent.nrows, opts.vecExponent.ncols)
+    te = modelmat.zeros(opts.timeExponent.nrows, opts.timeExponent.ncols)
+    alpha = modelmat.zeros(opts.alpha.nrows, opts.alpha.ncols)
+    ve <-- opts.vecExponent
+    te <-- opts.timeExponent
+    alpha <-- opts.alpha
+  }
+
+  def update2(step:Long):Unit = {
+    val nsteps = if (step == 0) 1f else {
+      if (firstStep == 0f) {
+        firstStep = step
+        1f
+      } else {
+        step / firstStep
+      }
+    }
+    stepn.set(nsteps)
+    val nw = 1f / stepn
+    val newsquares = updatemat *@ updatemat
+    newsquares ~ newsquares *@ nw
+    sumSq ~ sumSq *@ (1f - nw)
+    sumSq ~ sumSq + newsquares
+    if (opts.waitsteps < nsteps) {
+      val tmp = sumSq ^ ve
+      tmp ~ tmp *@ (stepn ^ te)
+      tmp ~ tmp + opts.epsilon
+      modelmat ~ modelmat + ((updatemat / tmp) *@ alpha)
+      if (mask != null) modelmat ~ modelmat *@ mask
+    }
+  }
+
+  def update(step:Long):Unit = {
+    val nsteps = if (step == 0) 1f else {
+      if (firstStep == 0f) {
+        firstStep = step
+        1f
+      } else {
+        step / firstStep
+      }
+    }
+    stepn.set(nsteps)
+    val nw = 1f / stepn
+    val newsquares = updatemat *@ updatemat
+    newsquares ~ newsquares *@ nw
+    sumSq ~ sumSq *@ (1f - nw)
+    sumSq ~ sumSq + newsquares
+    if (opts.waitsteps < nsteps) {
+      val tmp = sumSq ^ ve
+      tmp ~ tmp *@ (stepn ^ te)
+      tmp ~ tmp + opts.epsilon
+      tmp ~ updatemat / tmp
+      tmp ~ tmp *@ alpha
+      modelmat ~ modelmat + tmp
+      if (mask != null) modelmat ~ modelmat *@ mask
+    }
+  }
+}
+
+
+
+object IncNormUpdater {
+  class Options extends Updater.Options {
+    var warmup = 0L
+    var power = 0.9f
+  }
+}
+
+object IncMultUpdater {
+  class Options extends Updater.Options {
+    var warmup = 0L
+    var power = 0.9f
+  }
+}
+
+object BatchNormUpdater {
+  class Options extends Updater.Options {
+
+  }
+}
+
+object BatchMultUpdater {
+  class Options extends Updater.Options {
+    var eps = 1e-12
+
+  }
+}
+
+object TelescopingUpdater {
+  class Options extends Updater.Options {
+    var factor = 1.5f // fix: was `val`, which made the telescoping factor unconfigurable; every other Options field is a var
+  }
+}
+
+object GradUpdater {
+  class Options extends Updater.Options {
+    var alpha:FMat = 1f
+    var timeExponent:FMat = 0.5f
+    var waitsteps = 2
+    var mask:FMat = null
+  }
+}
+
+
+object ADAGradUpdater {
+  class Options extends GradUpdater.Options {
+    var vecExponent:FMat = 0.5f
+    var epsilon = 1e-15f
+    var initsumsq = 1e-8f
+  }
+}
+
+object Updater {
+  class Options {

+  }
+}