本实验要求使用一切可能的办法来优化程序(附:实验文件完整代码)。

rotate

这就是一个矩阵转置的问题。只要让内部循环得以连续写入就会得到很大的性能提升。经过我的实验发现,循环所需的性能消耗可以忽略不计,所以如果只是单纯地加大循环步长并没有获得什么性能提升。毕竟读取内存消耗的资源比CPU计算消耗的资源不知道要高到哪里去了。

char rotate_descr[] = "rotate: Current working version";
void rotate(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; i+=32)
    for (j = 0; j < dim; j++) {
        dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
        dst[RIDX(dim-1-j, i+1, dim)] = src[RIDX(i+1, j, dim)];
        dst[RIDX(dim-1-j, i+2, dim)] = src[RIDX(i+2, j, dim)];
        dst[RIDX(dim-1-j, i+3, dim)] = src[RIDX(i+3, j, dim)];
        dst[RIDX(dim-1-j, i+4, dim)] = src[RIDX(i+4, j, dim)];
        dst[RIDX(dim-1-j, i+5, dim)] = src[RIDX(i+5, j, dim)];
        dst[RIDX(dim-1-j, i+6, dim)] = src[RIDX(i+6, j, dim)];
        dst[RIDX(dim-1-j, i+7, dim)] = src[RIDX(i+7, j, dim)];
        dst[RIDX(dim-1-j, i+8, dim)] = src[RIDX(i+8, j, dim)];
        dst[RIDX(dim-1-j, i+9, dim)] = src[RIDX(i+9, j, dim)];
        dst[RIDX(dim-1-j, i+10, dim)] = src[RIDX(i+10, j, dim)];
        dst[RIDX(dim-1-j, i+11, dim)] = src[RIDX(i+11, j, dim)];
        dst[RIDX(dim-1-j, i+12, dim)] = src[RIDX(i+12, j, dim)];
        dst[RIDX(dim-1-j, i+13, dim)] = src[RIDX(i+13, j, dim)];
        dst[RIDX(dim-1-j, i+14, dim)] = src[RIDX(i+14, j, dim)];
        dst[RIDX(dim-1-j, i+15, dim)] = src[RIDX(i+15, j, dim)];
        dst[RIDX(dim-1-j, i+16, dim)] = src[RIDX(i+16, j, dim)];
        dst[RIDX(dim-1-j, i+17, dim)] = src[RIDX(i+17, j, dim)];
        dst[RIDX(dim-1-j, i+18, dim)] = src[RIDX(i+18, j, dim)];
        dst[RIDX(dim-1-j, i+19, dim)] = src[RIDX(i+19, j, dim)];
        dst[RIDX(dim-1-j, i+20, dim)] = src[RIDX(i+20, j, dim)];
        dst[RIDX(dim-1-j, i+21, dim)] = src[RIDX(i+21, j, dim)];
        dst[RIDX(dim-1-j, i+22, dim)] = src[RIDX(i+22, j, dim)];
        dst[RIDX(dim-1-j, i+23, dim)] = src[RIDX(i+23, j, dim)];
        dst[RIDX(dim-1-j, i+24, dim)] = src[RIDX(i+24, j, dim)];
        dst[RIDX(dim-1-j, i+25, dim)] = src[RIDX(i+25, j, dim)];
        dst[RIDX(dim-1-j, i+26, dim)] = src[RIDX(i+26, j, dim)];
        dst[RIDX(dim-1-j, i+27, dim)] = src[RIDX(i+27, j, dim)];
        dst[RIDX(dim-1-j, i+28, dim)] = src[RIDX(i+28, j, dim)];
        dst[RIDX(dim-1-j, i+29, dim)] = src[RIDX(i+29, j, dim)];
        dst[RIDX(dim-1-j, i+30, dim)] = src[RIDX(i+30, j, dim)];
        dst[RIDX(dim-1-j, i+31, dim)] = src[RIDX(i+31, j, dim)];
    }
}

 smooth

 这个函数用来模糊图像用的,原来的函数实现里做了太多的重复计算,所以我的基本思想就是把那些可以重复利用的计算结果。

例如有这么一个图像:

[0][0] [0][1] [0][2] [0][3]
[1][0] [1][1] [1][2] [1][3]
[2][0] [2][1] [2][2] [2][3]

在处理[1][1]的时候,我们需要计算[1][1]附近的9个点的和,不难发现,如果在处理[1][2]这个点的时候,第1列和第2列的的和还是可以继续用的。采用这个思想,可以解决重复计算的问题,不过优化的效果不是特别明显。

另外一个注意的地方就是避免函数调用其它函数,虽然这样很繁琐,但是对于性能有很大的提升。

/*
 * smooth - Your current working version of smooth. 
 * IMPORTANT: This is the version you will be graded on
 */
char smooth_descr[] = "smooth: Current working version";
void smooth(int dim, pixel *src, pixel *dst) 
{
    int i, j;
    pixel_sum col1, col2, col3;
    for (i = 0; i < dim; i++) {
        col2.red = col2.green = col2.blue = col2.num = 0;
        col3.red = col3.green = col3.blue = col3.num = 0;
        if (i < dim-1) {
            col3.red += src[RIDX(i+1, 0, dim)].red;
            col3.green += src[RIDX(i+1, 0, dim)].green;
            col3.blue += src[RIDX(i+1, 0, dim)].blue;
            col3.num++;
        }
        col3.red += src[RIDX(i, 0, dim)].red;
        col3.green += src[RIDX(i, 0, dim)].green;
        col3.blue += src[RIDX(i, 0, dim)].blue;
        col3.num++;
        if (i > 0) {
            col3.red += src[RIDX(i-1, 0, dim)].red;
            col3.green += src[RIDX(i-1, 0, dim)].green;
            col3.blue += src[RIDX(i-1, 0, dim)].blue;
            col3.num++;
        }
        for (j = 0; j < dim; j++) {
            col1 = col2;
            col2 = col3;
            col3.red = col3.green = col3.blue = col3.num = 0;
            if (j < dim-1) {
                col3.red += src[RIDX(i, j+1, dim)].red;
                col3.green += src[RIDX(i, j+1, dim)].green;
                col3.blue += src[RIDX(i, j+1, dim)].blue;
                col3.num++;
            }
            if (j < dim-1 && i > 0) {
                col3.red += src[RIDX(i-1, j+1, dim)].red;
                col3.green += src[RIDX(i-1, j+1, dim)].green;
                col3.blue += src[RIDX(i-1, j+1, dim)].blue;
                col3.num++;
            }
            if (j < dim-1 && i < dim-1) {
                col3.red += src[RIDX(i+1, j+1, dim)].red;
                col3.green += src[RIDX(i+1, j+1, dim)].green;
                col3.blue += src[RIDX(i+1, j+1, dim)].blue;
                col3.num++;
            }
            dst[RIDX(i, j, dim)].red = (unsigned short) ((col1.red + col2.red + col3.red)/(col1.num + col2.num + col3.num));
            dst[RIDX(i, j, dim)].green = (unsigned short) ((col1.green + col2.green + col3.green)/(col1.num + col2.num + col3.num));
            dst[RIDX(i, j, dim)].blue = (unsigned short) ((col1.blue + col2.blue + col3.blue)/(col1.num + col2.num + col3.num));
        }
    }
}