For example, if each leaf node of the decision trees forms a regression class,
the number of regression classes reaches tens of thousands. In the current HTK
implementation, the outer product of an observation vector is first stored in
the bDiagMat of the RegAccs of every regression class, as follows:
for (b=1;b<=bclass->numClasses;b++) {
...
/* now update the outerproduct observation */
if (svec != NULL) {
nblock = (int)(ra->bDiagMat[0]);
for (bl=1, cnt=1; bl<=nblock;bl++){
bsize = TriMatSize(ra->bDiagMat[bl]);
m = ra->bDiagMat[bl];
for (i=1, cnti=cnt; i<=bsize; i++,cnti++) { /* Fill the outer product
*/
for (j=1,cntj=cnt; j<=i; j++,cntj++)
m[i][j] = svec[cnti]*svec[cntj];
}
cnt +=bsize;
}
}
}
These outer products are then accumulated into the bTriMat of the RegAccs of
each regression class whose bVector[1]>0, as follows:
if ((ra->bTriMat != NULL) && (ra->bVector[1]>0)) {
acc = ra->bVector;
nblock = (int)(ra->bDiagMat[0]);
for (bl=1,cnti=1;bl<=nblock;bl++) {
m = ra->bDiagMat[bl];
bsize = TriMatSize(m);
for (i=1;i<=bsize;i++,cnti++) { /* Fill the accumulate stores */
tm = ra->bTriMat[cnti];
for (j=1; j<=bsize; j++)
for (k=1; k<=j; k++)
tm[j][k] += m[j][k] * acc[cnti];
}
}
ZeroDVector(ra->bVector);
}
As you can see, this is computationally expensive. If the number of
regression classes is large, calculating and storing the outer products takes
a long time. But for most regression classes bVector[1] is 0, so most of the
outer-product computations are wasted.