日月明

并行与分布式计算导论——作业(二)
Based on markdown 推荐使用markdown阅读器阅读,或访问 并行与分布式计算导论——作业二[T...
扫描右侧二维码阅读全文
09
2019/04

并行与分布式计算导论——作业(二)

该部分仅登录用户可见

Based on markdown 推荐使用markdown阅读器阅读,或访问 并行与分布式计算导论——作业二

[TOC]

作业要求

完成以下OpenMP程序,提交作业报告,并在报告的最后附源码,格式为PDF.
The Number of Inversion Pairs
Bucket Sort
报告的内容可以包括:
1.并行的核心代码或核心思想;
2.与参考串行/并行程序相比较,或使用不同的线程数量比较输出结果并简单分析;
3.使用OpenMP的心得或遇到的困难.

The Number of Inversion Pairs

结果比较(本机8代标压酷睿i7-6核心)

问题规模线程数时间(ms)线程数时间(ms)线程数时间(ms)线程数时间(ms)
$10^4$17274666
$10^6$12072122480669
$10^7$12.115(s)21.205(s)47776635
$10^8$123.900(s)213.568(s)48.183(s)66.593(s)

结果比较(临湖草堂)

由于未知原因...在临湖草堂测试时,逆序对数量输出不对,故本程序不做参考

并附上问题规模为$10^7$时,本机上的对比数据

最后一项为使用临湖草堂示例代码给出的结果,以证明并行代码的结论是正确的.

# rainshaw @ gaoruixiaodeMacBook-Pro-15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework2/InversionPairs/cmake-build-debug [11:44:02] 
$ time ./InversionPairs 10000000 6
25009750388965
./InversionPairs 10000000 6  2.14s user 0.04s system 342% cpu 0.635 total

# rainshaw @ gaoruixiaodeMacBook-Pro-15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework2/InversionPairs/cmake-build-debug [11:44:17] 
$ time ./InversionPairs 10000000 4 
25009750388965
./InversionPairs 10000000 4  2.10s user 0.03s system 274% cpu 0.777 total

# rainshaw @ gaoruixiaodeMacBook-Pro-15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework2/InversionPairs/cmake-build-debug [11:44:32] 
$ time ./InversionPairs 10000000 2
25009750388965
./InversionPairs 10000000 2  2.08s user 0.03s system 174% cpu 1.205 total

# rainshaw @ gaoruixiaodeMacBook-Pro-15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework2/InversionPairs/cmake-build-debug [11:44:45] 
$ time ./InversionPairs 10000000 1
25009750388965
./InversionPairs 10000000 1  2.08s user 0.03s system 99% cpu 2.115 total

# rainshaw @ gaoruixiaodeMacBook-Pro-15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework2/InversionPairs/cmake-build-debug [11:44:58] 
$ time ./InversionPairs 10000000 1
25009750388965
./InversionPairs 10000000 1  2.16s user 0.02s system 99% cpu 2.194 total

从表中可以看出,当问题规模达到$10^6$时,此时采用并行算法可以显著的降低程序运行时间.

并行代码与核心思想

并行核心代码如下

long long omp_merge_sort(int beg, int mid, int end){
    long long ret=0;
    int lpos=beg, rpos=mid;
    int i;
    for (i = beg; i <= end; ++i)
        tmp[i] = num[i];
    i=beg;

    while(lpos<mid && rpos<end){
        if (tmp[lpos] <= tmp[rpos])
            num[i++] = tmp[lpos++];
        else {
            num[i++] = tmp[rpos++];
            ret += mid - lpos;
        }
    }
    while (lpos < mid)
        num[i++] = tmp[lpos++];
    while (rpos < end)
        num[i++] = tmp[rpos++];
    return ret;
}

long long omp_merge(int N) {
    int i, j, ttmp;
    long long ret=0;
#pragma omp parallel for private(i, ttmp) reduction(+:ret)
    for (i = 0; i < N / 2; ++i) {
        if(*(num+i*2)>*(num+i*2+1)){
            ttmp=*(num+i*2);
            *(num+i*2)=*(num+i*2+1);
            *(num+i*2+1)=ttmp;
            ret++;
        }
    }

    for (i = 2; i < N; i*=2) {
#pragma omp parallel for private(j) shared(i) reduction(+:ret)
        for(j=0;j<N-i; j+=2*i){
            ret+=omp_merge_sort(j, j+i, j+2*i<N?j+2*i:N);
        }
    }

    return ret;

}

归并排序是一个自顶向下的分治算法,但自顶向下不好并行,所以采用自底向上的思想,先归并小区间,然后逐步增大归并区间长度.则同一层中的归并过程就可以并行了!

程序源码

#include <cstdio>
#include <random>
#include <omp.h>
using namespace std;
int *num, *tmp;
//int num[20000], tmp[20000];

long long merge(int beg, int end) {
    if (beg == end)
        return 0;
    int mid = (beg + end) >> 1, lpos = beg, rpos = mid + 1;
    long long ret = merge(beg, mid) + merge(mid + 1, end);
    int i;
    for (i = beg; i <= end; ++i)
        tmp[i] = num[i];
    i = beg;
    while (lpos <= mid && rpos <= end)
        if (tmp[lpos] <= tmp[rpos])
            num[i++] = tmp[lpos++];
        else {
            num[i++] = tmp[rpos++];
            ret += mid - lpos + 1;
        }
    while (lpos <= mid)
        num[i++] = tmp[lpos++];
    while (rpos <= end)
        num[i++] = tmp[rpos++];
    return ret;
}

long long omp_merge_sort(int beg, int mid, int end){
    long long ret=0;
    int lpos=beg, rpos=mid;
    int i;
    for (i = beg; i <= end; ++i)
        tmp[i] = num[i];
    i=beg;

    while(lpos<mid && rpos<end){
        if (tmp[lpos] <= tmp[rpos])
            num[i++] = tmp[lpos++];
        else {
            num[i++] = tmp[rpos++];
            ret += mid - lpos;
        }
    }
    while (lpos < mid)
        num[i++] = tmp[lpos++];
    while (rpos < end)
        num[i++] = tmp[rpos++];
    return ret;
}

long long omp_merge(int N) {
    int i, j, ttmp;
    long long ret=0;
#pragma omp parallel for private(i, ttmp) reduction(+:ret)
    for (i = 0; i < N / 2; ++i) {
        if(*(num+i*2)>*(num+i*2+1)){
            ttmp=*(num+i*2);
            *(num+i*2)=*(num+i*2+1);
            *(num+i*2+1)=ttmp;
            ret++;
        }
    }

    for (i = 2; i < N; i*=2) {
#pragma omp parallel for private(j) shared(i) reduction(+:ret)
        for(j=0;j<N-i; j+=2*i){
            ret+=omp_merge_sort(j, j+i, j+2*i<N?j+2*i:N);
        }
    }

    return ret;

}

int main(int argc, char* argv[]) {
    int N;
//    scanf("%d", &N);
    int threads;
    //scanf("%d",&threads);
    N=atoi(argv[1]);
    threads=atoi(argv[2]);


    num=new int[N+10];
    tmp=new int[N+10];

//    for (int i = 0; i < N; ++i)
//        scanf("%d", num + i);

    default_random_engine e;
    for(int i=0; i<N;++i)
        *(num+i)=e();

    for(int i=0; i<32;++i){
        num[e()%N]=num[e()%N];
    }

//    printf("%lld\n", merge(0, N - 1));

    omp_set_num_threads(threads);
    printf("%lld\n", omp_merge(N));

    return 0;
}

Bucket Sort

结果比较(本机8代标压酷睿i7-6核心)

问题规模每线程桶数线程数时间(ms)线程数时间(ms)线程数时间(ms)线程数时间(ms)
$10^4$1018284865
$10^6$101148291470656
$10^7$1011.533(s)291546436533
$10^8$100113.084(s)27.935(s)45.320(s)64.463(s)

结果比较(临湖草堂)

问题规模每线程桶数线程数时间(ms)线程数时间(ms)线程数时间(ms)线程数时间(ms)
$10^6$1011.8716e-0121.2269e-0138.3193e-0248.4673e-02
$10^7$100011.0209e+0027.0260e-013=5.7771e-0145.1213e-01

并附上4线程$dim=10^7,n_bucket=1000$时,临湖草堂上的对比数据

reference serial impl.  : time_cost=1.1017e+00                   res:The data is sorted.

reference parallel impl.: time_cost=4.4308e-01 speedup=2.487e+00 res:The data is sorted.

your impl.              : time_cost=5.1213e-01 speedup=2.151e+00 res:The data is sorted.

从两张表中可以看出,当问题规模较小时(小于$10^6$),采用并行算法并没有明显的增益,而当数据规模较大时,例如$10^8$,此时采用并行算法可以显著的降低程序运行时间.

并行核心代码

#pragma omp parallel for
    for (i = 0; i < n_buckets; i++)
        qsort(B + buckets[i].start, buckets[i].n_elem, sizeof(int), cmpfunc);

程序的其他部分也存在很多循环,但由于其中的部分数值无法原子更新导致循环无法并行化.

程序源码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

struct bucket {
    int n_elem;
    int index; // [start : n_elem)
    int start; //starting point in B array
};

int cmpfunc(const void * a, const void * b)
{
    return (*(int*)a - *(int*)b);
}

int main(int argc, char *argv[]) {

    int *A, *B, *temp;
    int dim, n_buckets, i, w, j, limit, num_threads;
    struct bucket *buckets; //array of buckets

    dim = atoi(argv[1]);
    n_buckets = atoi(argv[2]);
    num_threads = atoi(argv[3]);

    omp_set_num_threads(num_threads);
    limit = 10000000;
    w = (int) limit / n_buckets;
    A = (int *) malloc(sizeof(int) * dim);
    B = (int *) malloc(sizeof(int) * dim);

    buckets = (struct bucket *) calloc(n_buckets, sizeof(struct bucket));

    for (i = 0; i < dim; i++) {
        A[i] = random() % limit;
    }

    for (i = 0; i < dim; i++) {
        j = A[i] / w;
        if (j > n_buckets - 1)
            j = n_buckets - 1;
        buckets[j].n_elem++;
    }


    for (i = 1; i < n_buckets; i++) {
        buckets[i].index = buckets[i - 1].index + buckets[i - 1].n_elem;
        buckets[i].start = buckets[i - 1].start + buckets[i - 1].n_elem;
    }
    int b_index;

    for (i = 0; i < dim; i++) {
        j = A[i] / w;
        if (j > n_buckets - 1)
            j = n_buckets - 1;
        b_index = buckets[j].index++;
        B[b_index] = A[i];
    }

#pragma omp parallel for
    for (i = 0; i < n_buckets; i++)
        qsort(B + buckets[i].start, buckets[i].n_elem, sizeof(int), cmpfunc);

    temp = A;
    A = B;
    B = temp;

    int sorted = 1;
    for (i = 0; i < dim - 1; i++) {
        if (A[i] > A[i + 1])
            sorted = 0;
    }
    if (!sorted)
        printf("The data is not sorted!!!\n");
    else {
        printf("The data is sorted.\n");
    }
}
最后修改:2019 年 05 月 06 日 02 : 52 PM
如果觉得我的文章对你有用,请随意赞赏

发表评论