24: Parallel Algorithms #1


Parallel Algorithms

In a serial algorithm, only one line of execution happens at a time and there is only one call stack. In a parallel algorithm, multiple things happen at once and there are multiple call stacks, one per thread.

Care has to be taken to ensure that the multiple threads of execution don't interfere with each other. If they do interfere, the results of the algorithm can be incorrect in a different way each time it is run, without rand() or any other random number generator being involved: the nondeterminism comes entirely from how the operating system schedules the threads.

Bank Account Example

If two ATMs try to deposit money into the same account at the same time, the reads and writes can be interleaved and money can be lost. The listing below simulates one bad interleaving in a single thread; the //thread1: and //thread2: comments mark which thread would execute each line.
// bankAccount.cpp - download here

#include <iostream>

class Account {
public:
    Account();
    int getBalance();
    void setBalance(int value);
private:
    int m_Balance;
};

Account::Account()
    : m_Balance(100)
{
}

int Account::getBalance()
{
    return m_Balance;
}

void Account::setBalance(int value)
{
    m_Balance = value;
}

int main(int argc, char *argv[])
{
    Account account;

    // two ATM "threads" each try to deposit $20 into the $100 account

    //thread1:
    int bal1 = account.getBalance();
    bal1 += 20;

    //thread2:
    int bal2 = account.getBalance();
    bal2 += 20;

    //thread1:
    account.setBalance(bal1);

    //thread2:
    account.setBalance(bal2);   // overwrites thread1's deposit

    std::cout << "final balance: " << account.getBalance() << std::endl;
    //prints:
    //  final balance: 120   (thread1's $20 was lost)

    return 0;
}
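
The fix is to make the read-modify-write atomic, so no other thread can run between the read and the write. Below is a minimal sketch using a pthread mutex; this is not part of the posted bankAccount.cpp, and the deposit() helper is introduced here for illustration.
(compile with: $ g++ bankAccountFixed.cpp -lpthread)

// bankAccountFixed.cpp - sketch: protect the balance with a mutex

#include <pthread.h>
#include <iostream>

class Account {
public:
    Account() : m_Balance(100) { pthread_mutex_init(&m_mutex, NULL); }
    ~Account() { pthread_mutex_destroy(&m_mutex); }

    void deposit(int amount)
    {
        // the read-modify-write happens entirely inside the lock, so
        // two threads can no longer interleave between read and write
        pthread_mutex_lock(&m_mutex);
        m_Balance += amount;
        pthread_mutex_unlock(&m_mutex);
    }

    int getBalance()
    {
        pthread_mutex_lock(&m_mutex);
        int ret = m_Balance;
        pthread_mutex_unlock(&m_mutex);
        return ret;
    }

private:
    int m_Balance;
    pthread_mutex_t m_mutex;
};

void *depositProc(void *data)
{
    ((Account *) data)->deposit(20);
    return NULL;
}

int main()
{
    Account account;
    pthread_t t1, t2;
    pthread_create(&t1, NULL, &depositProc, &account);
    pthread_create(&t2, NULL, &depositProc, &account);
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);

    std::cout << "final balance: " << account.getBalance() << std::endl;
    //always prints:
    //  final balance: 140
    return 0;
}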

Parallel Matrix Multiplication

Each element of an output matrix can be computed independently:

[Figure: matrix multiplication diagram (image from Wikipedia [1])]

Non-parallel matrix multiplication, naive and cache-optimized. (Note: a power-of-two matrix size is particularly bad for the cache, because column accesses are then separated by a power-of-two stride and keep mapping to the same cache sets, evicting each other.)
(compile with: $ g++ matrixMultSerial.cpp UnixStopwatch.cpp)
// matrixMultSerial.cpp - download here

#include "UnixStopwatch.h"
#include <iostream>

int ** newMatrix(int size)
{
    int ** ret = new int*[size];
    for(int i = 0; i < size; ++i){
        ret[i] = new int[size];
        for(int j = 0; j < size; ++j){
            ret[i][j] = j;
        }
    }

    return ret;
}

void multiply(int ** a, int ** b, int ** c, int size)
{
    for(int i = 0; i < size; ++i){
        for(int j = 0; j < size; ++j){
            int sum = 0;
            for(int k = 0; k < size; ++k){
                // walks b down a column: poor cache behavior
                sum += a[i][k] * b[k][j];
            }
            c[i][j] = sum;
        }
    }
}

// cache-optimized version: assumes b is already stored transposed, so
// both a and b are walked along rows (sequential memory accesses)
void multiplyTranspose(int ** a, int ** b, int ** c, int size)
{
    for(int i = 0; i < size; ++i){
        for(int j = 0; j < size; ++j){
            int sum = 0;
            for(int k = 0; k < size; ++k){
                sum += a[i][k] * b[j][k];
            }
            c[i][j] = sum;
        }
    }
}

int main(int argc, char *argv[])
{
    int size = 1024;
    int ** a = newMatrix(size);
    int ** b = newMatrix(size);
    int ** c = newMatrix(size);

    UnixStopwatch watch;
    multiply(a, b, c, size);
    watch.stop();

    std::cout << "regular multiply elapsed ms: " << watch.getTime() << std::endl;

    watch.start();
    multiplyTranspose(a, b, c, size);
    watch.stop();

    std::cout << "transpose multiply elapsed ms: " << watch.getTime() << std::endl;

    //prints:
    //  regular multiply elapsed ms: 29622
    //  transpose multiply elapsed ms: 16640

    return 0;
}

Parallel matrix multiplication: each thread independently computes a contiguous block of rows of the output matrix.
(compile with: $ g++ matrixMultParallel.cpp UnixStopwatch.cpp -lpthread)
// matrixMultParallel.cpp - download here

#include "UnixStopwatch.h"
#include <iostream>
#include <pthread.h>

int ** newMatrix(int size)
{
    int ** ret = new int*[size];
    for(int i = 0; i < size; ++i){
        ret[i] = new int[size];
        for(int j = 0; j < size; ++j){
            ret[i][j] = j;
        }
    }

    return ret;
}

void serialMultiply(int ** a, int ** b, int ** c, int size)
{
    for(int i = 0; i < size; ++i){
        for(int j = 0; j < size; ++j){
            int sum = 0;
            for(int k = 0; k < size; ++k){
                sum += a[i][k] * b[k][j];
            }
            c[i][j] = sum;
        }
    }
}

// everything a worker thread needs to know about its slice of the work
struct ThreadData {
    int start_row;
    int stop_row;
    int size;
    int ** a;
    int ** b;
    int ** c;
};

void * threadProc(void * data)
{
    ThreadData * thread_data = (ThreadData *) data;
    int start_row = thread_data->start_row;
    int stop_row = thread_data->stop_row;
    int size = thread_data->size;
    int ** a = thread_data->a;
    int ** b = thread_data->b;
    int ** c = thread_data->c;

    // same triple loop as serialMultiply, restricted to this thread's rows
    for(int i = start_row; i < stop_row; ++i){
        for(int j = 0; j < size; ++j){
            int sum = 0;
            for(int k = 0; k < size; ++k){
                sum += a[i][k] * b[k][j];
            }
            c[i][j] = sum;
        }
    }

    return NULL;
}

void parallelMultiply(int ** a, int ** b, int ** c, int size)
{
    const int num_threads = 4;
    pthread_t threads[num_threads];
    ThreadData data[num_threads];

    int num_each = size / num_threads;

    for(int i = 0; i < num_threads; ++i){
        data[i].a = a;
        data[i].b = b;
        data[i].c = c;
        data[i].size = size;
        data[i].start_row = i * num_each;
        data[i].stop_row = (i + 1) * num_each;
        pthread_create(&threads[i], NULL, &threadProc, &data[i]);
    }

    for(int i = 0; i < num_threads; ++i){
        pthread_join(threads[i], NULL);
    }
}

void verifyResults(int ** c1, int ** c2, int size)
{
    for(int i = 0; i < size; ++i){
        for(int j = 0; j < size; ++j){
            int lhs = c1[i][j];
            int rhs = c2[i][j];

            if(lhs != rhs){
                std::cout << "results do not match!" << std::endl;
                return;
            }
        }
    }
    std::cout << "results match!" << std::endl;
}

int main(int argc, char *argv[])
{
    int size = 1024;
    int ** a = newMatrix(size);
    int ** b = newMatrix(size);
    int ** c1 = newMatrix(size);
    int ** c2 = newMatrix(size);

    UnixStopwatch watch;
    serialMultiply(a, b, c1, size);
    watch.stop();
    std::cout << "serial time ms: " << watch.getTime() << std::endl;

    watch.start();
    parallelMultiply(a, b, c2, size);
    watch.stop();
    std::cout << "parallel time ms: " << watch.getTime() << std::endl;

    verifyResults(c1, c2, size);

    //prints:
    //  serial time ms: 28821
    //  parallel time ms: 6629
    //  results match!

    return 0;
}
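
The listing above hard-codes num_threads = 4. To match the thread count to the actual machine, one option on Linux is sysconf; here is a tiny sketch (the Thread class used in later listings presumably wraps something similar in Thread::numCores()):

// numCores.cpp - sketch: query the number of online cores on Linux

#include <unistd.h>
#include <iostream>

int main()
{
    long cores = sysconf(_SC_NPROCESSORS_ONLN);
    std::cout << "cores: " << cores << std::endl;
    return 0;
}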

Barrier

A barrier makes every thread that calls wait() block until all of them (here, Thread::numCores() threads) have reached the same point in the code. The implementation below spins on mutex-protected counters, sleeping between checks; the three counting phases reset one another so that the same Barrier object can safely be reused for the next wait().

// Barrier.cpp - download here

#include "Barrier.h"
#include "Thread.h"

Barrier::Barrier()
    : m_count(0), m_count2(0), m_count3(0)
{
}

void Barrier::wait()
{
    // phase 1: check in, then spin until all numCores() threads arrive
    m_mutex.lock();
    m_count2 = 0;
    m_count++;
    m_mutex.unlock();

    while(true){
        m_mutex.lock();
        volatile int count = m_count;
        m_mutex.unlock();

        if(count == (int) Thread::numCores()){
            break;
        } else {
            Thread::sleep(10);
        }
    }

    // phase 2: a second checkpoint, so a fast thread re-entering wait()
    // cannot race ahead and corrupt the phase-1 count
    m_mutex.lock();
    m_count3 = 0;
    m_count2++;
    m_mutex.unlock();

    while(true){
        m_mutex.lock();
        volatile int count = m_count2;
        m_mutex.unlock();

        if(count == (int) Thread::numCores()){
            break;
        } else {
            Thread::sleep(10);
        }
    }

    // phase 3: reset the phase-1 counter so the barrier is reusable
    m_mutex.lock();
    m_count = 0;
    m_count3++;
    m_mutex.unlock();

    while(true){
        m_mutex.lock();
        volatile int count = m_count3;
        m_mutex.unlock();

        if(count == (int) Thread::numCores()){
            break;
        } else {
            Thread::sleep(10);
        }
    }
}
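
A minimal usage sketch, assuming the course's Thread.h/Barrier.h interfaces seen in the listings on this page (start(), run(), Thread::numCores()); the join() call is an assumption, flagged in the code:

// barrierDemo.cpp - sketch: every thread blocks in wait() until all arrive

#include "Barrier.h"
#include "Thread.h"
#include <iostream>
#include <vector>

class Worker : public Thread {
public:
    Worker(int id, Barrier *barrier) : m_id(id), m_barrier(barrier) {}

    void run()
    {
        std::cout << "thread " << m_id << " before barrier" << std::endl;
        m_barrier->wait();   // nobody passes until all numCores() threads arrive
        std::cout << "thread " << m_id << " after barrier" << std::endl;
    }

private:
    int m_id;
    Barrier *m_barrier;
};

int main()
{
    Barrier barrier;
    std::vector<Thread *> threads;
    for(int i = 0; i < (int) Thread::numCores(); ++i){
        Thread *curr = new Worker(i, &barrier);
        threads.push_back(curr);
        curr->start();
    }
    // assumption: the course's Thread class exposes a join(); if not,
    // the main thread needs some other way to wait for the workers
    for(size_t i = 0; i < threads.size(); ++i){
        threads[i]->join();
    }
    return 0;
}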

Parallel Scan

Sum and min/max can easily be implemented with P parallel processors: each step combines pairs of values in parallel, so with P on the order of N the whole reduction takes O(log_2(N)) steps (more generally, O(N/P + log_2(P)) time).

Example of parallel sum with 4 processors:

Starting sequence:

  0  1  2  3  4  5  6  7

4 processors each put the sum of 2 elements into the first slot of their sub-range of the output array:

  1  5  9  13

2 processors each put the sum of 2 elements into the first slot of their sub-range of the output array:

  6  22

1 processor puts the sum of the last 2 elements into the output array:

  28

A similar process can be used to find the min or the max. Parallel scan works well with very many cores, such as on a Graphics Processing Unit (GPU).
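
Here is a minimal sketch of the reduction pictured above, written with C++11 std::thread instead of the pthread/Thread wrappers used elsewhere on this page. It assumes the input length is a power of two; joining the threads serves as the barrier between rounds.
(compile with: $ g++ -std=c++11 treeSum.cpp -pthread)

// treeSum.cpp - sketch: tree reduction, one thread per pair per round

#include <iostream>
#include <thread>
#include <vector>

int main()
{
    std::vector<long long> data = {0, 1, 2, 3, 4, 5, 6, 7};

    // each round halves the number of values: worker i computes
    // data[2*i] + data[2*i+1] into slot i of the next round's array
    for(std::size_t active = data.size() / 2; active >= 1; active /= 2){
        std::vector<long long> next(active);
        std::vector<std::thread> workers;
        for(std::size_t i = 0; i < active; ++i){
            workers.emplace_back([&data, &next, i]() {
                next[i] = data[2 * i] + data[2 * i + 1];
            });
        }
        for(std::thread &w : workers){
            w.join();   // joining is the barrier between rounds
        }
        data = next;
    }

    std::cout << "sum: " << data[0] << std::endl;   // prints: sum: 28
    return 0;
}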

Parallel Sum Example for hw7

// parallelSum.cpp - download here

#include "Thread.h"
#include "Barrier.h"
#include "BlockingQueue.h"
#include <vector>
#include <iostream>

// one unit of work: sum *input_vec, leaving per-thread partial sums in
// *output_vec and the combined answer in ret
class psumJob {
public:
    std::vector<int> *input_vec;
    std::vector<long long> *output_vec;
    int size;
    long long ret;
};

class psumThread : public Thread {
public:
    psumThread(BlockingQueue<psumJob> *input_q,
               BlockingQueue<psumJob> *output_q,
               int thread_id, Barrier *barrier)

        : m_input_q(input_q), m_output_q(output_q),
          m_threadId(thread_id), m_barrier(barrier)
    {
    }

    void run(){
        while(true){
            psumJob curr_job = m_input_q->pop_front();
            long long sum = 0;
            // sum this thread's contiguous slice of the input
            for(int i = (m_threadId * curr_job.size); i < ((m_threadId+1) * curr_job.size); ++i){
                sum += curr_job.input_vec->at(i);
            }
            curr_job.output_vec->at(m_threadId) = sum;
            m_barrier->wait();   // wait until every partial sum is written

            if(m_threadId == 0){
                // thread 0 combines the partial sums and reports the result
                long long ret = 0;
                for(int i = 0; i < Thread::numCores(); ++i){
                    ret += curr_job.output_vec->at(i);
                }
                curr_job.ret = ret;
                m_output_q->push_back(curr_job);
            }

        }
    }

private:
    BlockingQueue<psumJob> *m_input_q;
    BlockingQueue<psumJob> *m_output_q;
    int m_threadId;
    Barrier *m_barrier;
};

class psum {
public:

    psum(){
        for(int i = 0; i < Thread::numCores(); ++i){
            Thread *curr = new psumThread(&m_toCores, &m_fromCores, i, &m_barrier);
            m_threads.push_back(curr);
            curr->start();
        }
    }

    long long sum(std::vector<int> &vec){
        std::vector<long long> output_vec;
        output_vec.resize(vec.size());

        // hand every worker thread an identical job description
        for(int i = 0; i < Thread::numCores(); ++i){
            psumJob job;
            job.input_vec = &vec;
            job.output_vec = &output_vec;
            job.size = vec.size() / Thread::numCores();   // elements per thread

            m_toCores.push_back(job);
        }

        // block until thread 0 pushes the combined result
        psumJob job = m_fromCores.pop_front();
        return job.ret;
    }

private:
    std::vector<Thread *> m_threads;
    BlockingQueue<psumJob> m_toCores;
    BlockingQueue<psumJob> m_fromCores;
    Barrier m_barrier;
};

long long serialSum(std::vector<int> &vec)
{
    long long ret = 0;
    for(int i = 0; i < (int) vec.size(); ++i){
        ret += vec[i];
    }
    return ret;
}

int main(int argc, char *argv[])
{
    psum p;
    std::vector<int> vec;
    for(int i = 0; i < 8*1024*1024; ++i){
        vec.push_back(i);
    }

    std::cout << p.sum(vec) << std::endl;
    std::cout << p.sum(vec) << std::endl;
    std::cout << p.sum(vec) << std::endl;

    std::cout << serialSum(vec) << std::endl;

    return 0;
}


psum.zip

Naive Parallel Sort (Many Processors)

A naive parallel sort with many processors (P) divides the input array into P sub-arrays, sorts them in parallel, and then performs log_2(P) rounds of pairwise merges. The time complexity is O((n/p)*log_2(n/p) + n): the first term is the parallel sort phase, and the merge rounds sum to O(n), dominated by the final merge of two n/2-element halves on a single processor.

Example with 4 processors:

Starting sequence:

  11  10  9  8  7  6  5  4  3  2  1  0

Sort 3 elements on each of the 4 processors in parallel:

  9  10  11 | 6  7  8 | 3  4  5 | 0  1  2

Merge 3 elements with 3 elements using 2 total processors in parallel:

  6  7  8  9  10  11 | 0  1  2  3  4  5

Merge 6 elements with 6 elements using 1 total processor:

  0  1  2  3  4  5  6  7  8  9  10  11

On a system with many x86 CPU cores this would probably work well, and the code is fairly easy to write compared to the more complicated parallel sorting algorithms. The slowest part is the merging, which doesn't effectively utilize all of the available processing power: each round halves the number of active processors, and the final merge runs on just one. On GPUs this approach is slow because each individual processing core is much slower than a regular x86 CPU core.
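
Below is a sketch of this scheme, again using C++11 std::thread for brevity rather than pthreads; it assumes the element count is the chunk size times a power-of-two number of chunks, so every merge round pairs up evenly.
(compile with: $ g++ -std=c++11 naiveParallelSort.cpp -pthread)

// naiveParallelSort.cpp - sketch: sort P chunks in parallel, then
// log2(P) rounds of pairwise merges

#include <algorithm>
#include <iostream>
#include <thread>
#include <vector>

void naiveParallelSort(std::vector<int> &vec, int p)
{
    std::size_t chunk = vec.size() / p;

    // phase 1: p threads each sort one sub-array
    std::vector<std::thread> threads;
    for(int i = 0; i < p; ++i){
        threads.emplace_back([&vec, chunk, i]() {
            std::sort(vec.begin() + i * chunk, vec.begin() + (i + 1) * chunk);
        });
    }
    for(std::thread &t : threads) t.join();

    // phase 2: merge runs of doubling width; the thread count halves
    // each round, and the last round merges on a single thread
    for(std::size_t width = chunk; width < vec.size(); width *= 2){
        std::vector<std::thread> mergers;
        for(std::size_t lo = 0; lo + width < vec.size(); lo += 2 * width){
            mergers.emplace_back([&vec, lo, width]() {
                std::inplace_merge(vec.begin() + lo,
                                   vec.begin() + lo + width,
                                   vec.begin() + lo + 2 * width);
            });
        }
        for(std::thread &t : mergers) t.join();
    }
}

int main()
{
    std::vector<int> vec = {11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
    naiveParallelSort(vec, 4);
    for(std::size_t i = 0; i < vec.size(); ++i){
        std::cout << vec[i] << " ";   // prints: 0 1 2 ... 11
    }
    std::cout << std::endl;
    return 0;
}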

P-Ary Search

Instead of binary search, which divides the search range by 2 at each step, p-ary search divides it by P, so that P processors can probe the boundaries of all P sub-groups simultaneously [2].

Searching for 23. 4 processors each check the edges of one sub-group:

  6  8  9  10  11  12  15  16  17  20  23  50

The search narrows to the last sub-group; 3 processors each check one of the remaining elements:

  20  23  50
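
A minimal serial sketch of the idea follows; in a real implementation, such as the GPU version in [2], the P boundary checks inside the loop run concurrently, one per processor. The function name and structure here are illustrative, not from the paper.

// pArySearch.cpp - sketch: narrow the range by a factor of p per step

#include <algorithm>
#include <iostream>
#include <vector>

int pArySearch(const std::vector<int> &vec, int target, int p)
{
    int lo = 0;
    int hi = (int) vec.size();

    while(hi - lo > p){
        // "processor" i examines the boundary of sub-group i; keep the
        // one sub-group whose start is still <= target (vec is sorted)
        int width = (hi - lo + p - 1) / p;
        int next_lo = lo;
        for(int i = 0; i < p; ++i){
            int start = lo + i * width;
            if(start < hi && vec[start] <= target){
                next_lo = start;
            }
        }
        lo = next_lo;
        hi = std::min(lo + width, hi);
    }

    // final step: one "processor" per remaining element
    for(int i = lo; i < hi; ++i){
        if(vec[i] == target){
            return i;
        }
    }
    return -1;
}

int main()
{
    std::vector<int> vec = {6, 8, 9, 10, 11, 12, 15, 16, 17, 20, 23, 50};
    std::cout << pArySearch(vec, 23, 4) << std::endl;   // prints: 10
    return 0;
}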




References

  1. http://en.wikipedia.org/wiki/Matrix_multiplication
  2. http://static.usenix.org/event/hotpar09/tech/full_papers/kaldeway/kaldeway_html/