A Parallel Solution to the N-Queens Problem
The N-queens problem
Place n queens on an n*n board so that no two queens share a row, a column, or a diagonal, and count all arrangements. Since each column holds exactly one queen, a solution can be written as x1 x2 ... xn, where xi is the row of the queen in column i; every solution is therefore a permutation of [1, n]. One naive approach is to enumerate all permutations with a simple DFS and filter out the invalid ones. But that is far too brute-force: in many branches we can already tell the partial placement is invalid long before the last column is reached, and we should backtrack immediately.
The single-threaded solution
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define MAXN 100
#define true 1
#define false 0

inline int abs(int a) {
    return a >= 0 ? a : (-a);
}

int P[MAXN], used[MAXN], count, n;

/* Place a queen in column `index`, backtracking on conflicts. */
void generateP(int index) {
    int flag;
    if (index == n + 1) {       /* all n columns placed: one solution */
        count++;
        return;
    }
    for (int x = 1; x <= n; x++) {
        if (used[x] == false) { /* row x is still free */
            flag = true;
            /* prune: check diagonals against every earlier column */
            for (int pre = 1; pre < index; pre++) {
                if (abs(index - pre) == abs(x - P[pre])) {
                    flag = false;
                    break;
                }
            }
            if (flag) {
                P[index] = x;
                used[x] = true;
                generateP(index + 1);
                used[x] = false; /* backtrack */
            }
        }
    }
}

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "error args\n");
        return -1;
    }
    struct timeval start, end;
    n = atoi(argv[1]);
    long long startusec, endusec;
    gettimeofday(&start, NULL);
    generateP(1);
    gettimeofday(&end, NULL);
    printf("result count is %d\n", count);
    startusec = start.tv_sec * 1000000 + start.tv_usec;
    endusec = end.tv_sec * 1000000 + end.tv_usec;
    printf("spend %.4f s\n", (endusec - startusec) / 1000000.0);
    return 0;
}
Compile and run:
wd@ub-linux:Nhuanghou$ ./a.out 8
result count is 92
spend 0.0002 s
wd@ub-linux:Nhuanghou$ ./a.out 9
result count is 352
spend 0.0008 s
wd@ub-linux:Nhuanghou$ ./a.out 10
result count is 724
spend 0.0084 s
wd@ub-linux:Nhuanghou$ ./a.out 11
result count is 2680
spend 0.0228 s
wd@ub-linux:Nhuanghou$ ./a.out 12
result count is 14200
spend 0.1086 s
wd@ub-linux:Nhuanghou$ ./a.out 13
result count is 73712
spend 0.6085 s
wd@ub-linux:Nhuanghou$ ./a.out 14
result count is 365596
spend 3.8334 s
wd@ub-linux:Nhuanghou$ ./a.out 15
result count is 2279184
spend 25.2267 s
The running time clearly grows exponentially with n. I didn't bother with 16 queens; the wait would be too long.
Could a multithreaded version speed this up?
The parallel solution
Decompose by the first column: fix the row of the first column's queen in advance, then hand the remaining search to worker threads as independent tasks. The number of CPU cores is limited (my machine has 4), so creating n threads is pointless; 4 worker threads is a better fit. We therefore need a mechanism that manages task distribution, so that a thread which finishes its task automatically picks up the next pending one. That is exactly what a thread pool provides. The rest is a small change to the single-threaded program: privatize the shared state. Since the per-task searches run concurrently, each thread needs its own copy of P[] and used[] (here declared __thread). (A detailed, annotated post on the thread pool will come later.)
Here is the code:
#include "ThreadPool.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define MAXN 100
#define true 1
#define false 0

struct Result {
    int xUsed;  /* fixed row of the queen in column 1 */
    int count;  /* solutions found under that fixed placement */
};

inline int abs(int a) {
    return a >= 0 ? a : (-a);
}

int n;
/* per-thread state: each worker searches its own subtree */
__thread Result *re;
__thread int P[MAXN], used[MAXN];

void generateP(int index) {
    int flag;
    if (index == n + 1) {
        re->count++;
        return;
    }
    for (int x = 1; x <= n; x++) {
        if (used[x] == false) {
            flag = true;
            for (int pre = 1; pre < index; pre++) {
                if (abs(index - pre) == abs(x - P[pre])) {
                    flag = false;
                    break;
                }
            }
            if (flag) {
                P[index] = x;
                used[x] = true;
                generateP(index + 1);
                used[x] = false;
            }
        }
    }
}

void thrFun(void *arg) {
    struct timeval start, end;
    long long startusec, endusec;
    re = (Result *)arg;
    memset(P, 0, sizeof(int) * MAXN);
    memset(used, 0, sizeof(int) * MAXN);
    P[1] = re->xUsed;        /* column 1 is fixed by the task */
    used[re->xUsed] = true;
    gettimeofday(&start, NULL);
    generateP(2);            /* the search starts from column 2 */
    gettimeofday(&end, NULL);
    startusec = start.tv_sec * 1000000 + start.tv_usec;
    endusec = end.tv_sec * 1000000 + end.tv_usec;
    printf("spend %lld us\n", endusec - startusec);
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "error args\n");
        return -1;
    }
    int nThr;
    int sum = 0;
    struct timeval start, end;
    n = atoi(argv[1]);
    nThr = atoi(argv[2]);
    long long startusec, endusec;
    Result *reArray = (Result *)malloc(sizeof(Result) * n);
    memset(reArray, 0, sizeof(Result) * n);
    gettimeofday(&start, NULL);
    {   /* scope: ~ThreadPool() joins all workers before we sum up */
        ThreadPool tp(nThr);
        for (int i = 0; i < n; i++) {
            reArray[i] = {i + 1, 0};
            tp.addTask(thrFun, (void *)&reArray[i]);
        }
    }
    for (int i = 0; i < n; i++) {
        sum += reArray[i].count;
    }
    gettimeofday(&end, NULL);
    printf("result = %d\n", sum);
    startusec = start.tv_sec * 1000000 + start.tv_usec;
    endusec = end.tv_sec * 1000000 + end.tv_usec;
    printf("spend %.4f s\n", (endusec - startusec) / 1000000.0);
    free(reArray);
    return 0;
}
Compiling and testing shows a substantial improvement in computation time:
wd@ub-linux:Nhuanghou$ ./a.out 15 4
_worker ar 7010
_worker ar 7012
_worker ar 7009
_worker ar 7011
threadpool stopped
7010 get a task
7012 get a task
7009 get a task
7011 get a task
7010 get a task
7012 get a task
7009 get a task
7011 get a task
7010 get a task
7012 get a task
7009 get a task
7011 get a task
7010 get a task
7012 get a task
7009 get a task
all threads exit
result = 2279184
spend 10.5478 s
This time 15 queens took only about 10 s. A good speedup.
But there is a puzzle. With only one worker thread:
result = 2279184
spend 37.4849 s
It takes over 10 s longer than the original single-threaded program, which is rather alarming. I haven't found the cause yet; if you know why, please let me know.
I later profiled both versions with perf:
(the original single-process version, N = 15)
result count is 2279184
spend 25.2779 s
Performance counter stats for './a.out 15':
25262.110738 task-clock (msec)
400 context-switches
0 cpu-migrations
50 page-faults
93,273,276,839 cycles
142,353,387,972 instructions
17,434,460,005 branches
1,307,320,846 branch-misses
25.278347191 seconds time elapsed
(the multithreaded version, but with only one worker thread)
result = 2279184
spend 36645590 us
Performance counter stats for './a.out 15 1':
36645.398798 task-clock (msec)
115 context-switches
0 cpu-migrations
110 page-faults
135,389,381,037 cycles
225,738,593,916 instructions
42,698,239,331 branches
1,254,707,682 branch-misses
36.647456082 seconds time elapsed
In theory the instruction counts should be roughly equal, yet they differ by over 80 billion. Both runs do the computation on a single thread, so why does the second execute about 1.6 times as many instructions? Is it the algorithm?
Thread pool code
/* ThreadPool.h */
#ifndef __THREADPOOL_H__
#define __THREADPOOL_H__

#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <sys/types.h>
#include <deque>
#include <vector>

typedef void (*CallBack)(void *);

struct Task
{
    CallBack func;  /* function to run */
    void *arg;      /* its argument */
};

class ThreadPool
{
public:
    ThreadPool(int size);
    ~ThreadPool();
    void addTask(CallBack f, void *arg);
private:
    static void *_worker(void *arg);
    bool _stopped;
    int _size;
    pthread_cond_t _cond;
    pthread_mutex_t _mutex;
    std::deque<Task> _tasks;           /* pending tasks */
    std::vector<pthread_t> _threadVec; /* worker thread ids */
};

#endif
/* ThreadPool.cpp */
#include "ThreadPool.h"

void *ThreadPool::_worker(void *arg)
{
    ThreadPool *obj = (ThreadPool *)arg;
    pid_t tid = syscall(SYS_gettid);
    printf("_worker ar %d\n", tid);
    while (1)
    {
        pthread_mutex_lock(&obj->_mutex);
        /* sleep until there is a task or the pool is shutting down */
        while (obj->_tasks.empty() && !obj->_stopped)
        {
            pthread_cond_wait(&obj->_cond, &obj->_mutex);
            printf("%d get signal\n", tid);
        }
        Task t;
        if (!obj->_tasks.empty())
        {
            printf("%d get a task\n", tid);
            t = obj->_tasks.front();
            obj->_tasks.pop_front();
            pthread_mutex_unlock(&obj->_mutex);
            t.func(t.arg);  /* run the task outside the lock */
            continue;
        }
        break;  /* queue empty and pool stopped: exit (mutex still held) */
    }
    pthread_mutex_unlock(&obj->_mutex);
    return (void *)0;
}

ThreadPool::ThreadPool(int size)
{
    pthread_t tid;
    _size = size;
    _stopped = false;
    /* PTHREAD_*_INITIALIZER is only valid as an initializer, not as the
     * right-hand side of an assignment, so use the *_init functions */
    pthread_cond_init(&_cond, NULL);
    pthread_mutex_init(&_mutex, NULL);
    for (int i = 0; i < size; i++)
    {
        pthread_create(&tid, NULL, ThreadPool::_worker, this);
        _threadVec.push_back(tid);
    }
}

ThreadPool::~ThreadPool()
{
    pthread_mutex_lock(&_mutex);
    _stopped = true;
    printf("threadpool stopped\n");
    /* wake every sleeping worker BEFORE joining: broadcasting only after
     * a join can deadlock, since the joined-on thread may still be
     * blocked in pthread_cond_wait with no wakeup pending */
    pthread_cond_broadcast(&_cond);
    pthread_mutex_unlock(&_mutex);
    int threadSize = _threadVec.size();
    for (int i = 0; i < threadSize; i++)
    {
        pthread_join(_threadVec[i], NULL);
    }
    pthread_cond_destroy(&_cond);
    pthread_mutex_destroy(&_mutex);
    printf("all threads exit\n");
}

void ThreadPool::addTask(CallBack f, void *arg)
{
    Task t = {f, arg};
    pthread_mutex_lock(&_mutex);
    _tasks.push_back(t);
    pthread_cond_signal(&_cond);  /* wake one idle worker */
    pthread_mutex_unlock(&_mutex);
}