Calculating the second virial coefficient ...

... or the importance of vectorizing

We want to calculate the second virial coefficient for a system of hard spherocylinders using Monte Carlo integration. Our aim is to compare four different methods for efficiency, namely:

Using loops in Python

Using a vectorized approach in Python

Using Fortran code externally

Using a Fortran function we import into Python

We shall start with the first one, using loops in Python, by coding this in as we would if we were using C or Fortran.

First, we import some libraries/modules

import numpy as np                     #we always need arrays
import pandas as pd                    #we want to put our results in a table
import matplotlib.pyplot as plt        #and we wouldn't mind plotting them
from scipy.optimize import brentq      #we will need to find the root of a function
import time                            #and of course, we want to time everything

We now need to define a few functions: the following two are used to find the value of alpha for a given nematic order parameter

def nematic_orderparam(alpha, S2):
    """Function to calculate the nematic order parameter for a given alpha, minus a guess"""
    return 1.0 + 3.0/(alpha*alpha) - 3.0/(alpha*np.tanh(alpha))-S2

def find_alpha(S2):
    """Function to find the alpha for a given nematic order parameter"""
    return brentq(nematic_orderparam, 1e-10, 2e3, args = (S2))

Now we define three functions to generate a unit vector for three different systems:

iso: the isotropic phase, where the particles point in each direction with equal chance
nem: a perfectly aligned nematic phase, where all the particles point straight up
onsager: generates a unit vector from the Onsager distribution function, for a given nematic order parameter

def iso():
    """Generate a unit vector for a particle in the isotropic phase"""
    u = np.empty(3)
    u[2] = np.random.uniform()
    x = twopi*np.random.uniform()
    fac = np.sqrt(1.0-u[2]*u[2])
    u[0] = fac*np.cos(x)
    u[1] = fac*np.sin(x)
    return u

def nem():
    """Generate a unit vector for a particle in a perfectly aligned nematic phase"""
    u = np.zeros(3)
    u[2] = 1.0
    return u

def onsager(S2, alpha, sh_alpha):
    """Generate a unit vector from the Onsage trial function for a given nematic order parameter S2"""
    u = np.empty(3)
    x = np.random.uniform()*sh_alpha
    u[2] = np.log(x + np.sqrt(1.0 + x*x))/alpha
    x = twopi*np.random.uniform()
    fac = np.sqrt(1.0-u[2]*u[2])
    u[0] = fac*np.cos(x)
    u[1] = fac*np.sin(x)
    return u

And we also define a function to test for overlaps between particles

def overlap(sep,ui,uj):
    """Test for overlaps between two spherocylinders: returns True if they do"""
    over = False
    rui   = np.sum(sep*ui)
    ruj   = np.sum(sep*uj)
    uij   = np.sum(ui*uj)
    sinsq = 1.0 - np.sum(uij*uij)
    if sinsq < 1e-14:
        ci = -rui*0.5
        cj = ruj*0.5
    else:
        ci = (-rui+uij*ruj)/sinsq
        cj = (ruj-uij*rui)/sinsq
    if np.maximum(np.absolute(ci), np.absolute(cj)) >= xld2:
        if np.absolute(ci) >= np.absolute(cj):
            ci = xld2*np.sign(ci)
            cj = ruj + ci*uij
        else:
            cj = xld2*np.sign(cj)
            ci = -rui + cj*uij
        if np.absolute(ci) > xld2:
            ci = xld2*np.sign(ci)
        if np.absolute(cj) > xld2:
            cj = xld2*np.sign(cj)
    di = 2.0*rui  + ci-cj*uij
    dj = -2.0*ruj + cj-ci*uij
    sepsq = np.sum(sep*sep)
    if sepsq + ci*di + cj*dj < 1.0:
        over = True
    return over

And finally we need a function that calculates the value of the virial coefficients.

We set a number of S2 values that we want to study (n_points values, ranging from 0 to 1 in equally spaced steps).
We also set the number of configurations we will generate (i.e. the number of MC iterations). This is split into batches.
The funciton returns a dictionary containing the order parameters, the values for the virials and the errors

def onsager_B2(n_batch,n_trial, n_points):
    """Calculate the nematic order parameter using n_batch*n_trial configurations, for n_points order parameters"""
    rmax = 2.0*xld2 + 1.0
    fac = 4.0*rmax*rmax*rmax
    r1 = np.zeros(3)
    u1 = np.zeros(3)
    u2 = np.zeros(3)
    output = {}
    output['S2'] = np.zeros(n_points)
    output['B2'] = np.zeros(n_points)
    output['error'] = np.zeros(n_points)
    for i_s in range(n_points):
        S2 = i_s/(n_points-1)
        if S2 < 1.0 and S2 > 0.0:
            alpha = find_alpha(S2)
            sh_alpha = np.sinh(alpha)
        B2 = 0.0
        B2sq  = 0.0
        error = 0.0
        for ib in range(n_batch):
            B2_sub = 0.0
            n_over = 0.0
            for it in range(n_trial):
                r2 = np.random.uniform(-rmax,rmax,3)
                if S2 < 1e-10:
                    u1 = iso()
                    u2 = iso()
                elif S2 > 1.0-1e-10:
                    u1 = nem()
                    u2 = nem()
                else:
                    u1 = onsager(S2, alpha, sh_alpha)
                    u2 = onsager(S2, alpha, sh_alpha)
                if overlap(r2-r1, u1, u2):
                    n_over += 1.0
            B2_sub = n_over/n_trial
            B2    += B2_sub
            B2sq  += B2_sub*B2_sub
        B2 = B2/n_batch
        B2sq = B2sq/n_batch
        error = np.sqrt((B2sq-B2*B2)/(n_batch-1))
        output['S2'][i_s] =  S2
        output['B2'][i_s] =  B2*fac
        output['error'][i_s] =  error*fac
        print(output['S2'][i_s] ,output['B2'][i_s] ,output['error'][i_s] )
    return output

And now we run it all. We take the time before and after to measure performance.

We will look at spherocylinders with L/D = 5, which means the parameter xld2 = L/2 = 2.5

twopi = 2.0*np.pi
xld2 = 2.5
t0 = time.clock()
B2 = onsager_B2(200,200,21)
output_table = pd.DataFrame(B2)
output_table = output_table[["S2", "B2", "error"]]
#print(output_table)
t1 = time.clock()
print('')
print('CPU time for loops in Python:', t1-t0)

0.0 37.0224 0.867525871017
0.05 39.4416 0.882955286176
0.1 35.9856 0.802147220914
0.15 36.6552 0.819992673579
0.2 36.936 0.877329341735
0.25 35.748 0.848080730015
0.3 36.1152 0.835885907918
0.35 35.748 0.828503233573
0.4 35.0568 0.863010337722
0.45 33.9552 0.902769944484
0.5 35.1648 0.851998658078
0.55 33.8688 0.782816672352
0.6 31.9248 0.750617276834
0.65 32.1408 0.814635619626
0.7 30.6936 0.762152595017
0.75 30.6072 0.751233906792
0.8 28.08 0.734968033353
0.85 27.648 0.783750549999
0.9 26.0928 0.723937294822
0.95 23.544 0.728399249956
1.0 18.7056 0.659021051966

CPU time for loops in Python: 60.472637000000006

This took about 1 minute for 200*200 = 40,000 trial configurations. Not that fast.

Now we will rewrite these functions to do as much of the computation as we can vector-wise. This makes it all much faster, since then more of the run takes place in the compiled C/Fortran code that this is all built on, rather than in python loops.

def overlap_vector(sep,ui,uj,n_loop):
    """Function to test for overlaps between particles, takes n_loop configurations at once"""
    rui   = np.sum(sep*ui,axis=1)
    ruj   = np.sum(sep*uj,axis=1)
    uij   = np.sum(ui*uj,axis=1)
    sinsq = 1.0 - uij*uij
    ci = np.empty(n_loop)
    cj = np.empty(n_loop)
    listings = sinsq < 1e-14
    nlistings = ~listings
    ci[listings] = -rui[listings]*0.5
    cj[listings] = ruj[listings]*0.5
    ci[nlistings] = (-rui[nlistings]+uij[nlistings]*ruj[nlistings])/sinsq[nlistings]
    cj[nlistings] = (ruj[nlistings]-uij[nlistings]*rui[nlistings])/sinsq[nlistings]
    listings = (np.maximum(np.absolute(ci), np.absolute(cj)) >= xld2) & (np.absolute(ci) >= np.absolute(cj))
    ci[listings] = xld2*np.sign(ci[listings])
    cj[listings] = ruj[listings] + ci[listings]*uij[listings]
    listings = (np.maximum(np.absolute(ci), np.absolute(cj)) >= xld2) & (np.absolute(ci) < np.absolute(cj))
    cj[listings] = xld2*np.sign(cj[listings])
    ci[listings] = -rui[listings] + cj[listings]*uij[listings]
    listings = (np.maximum(np.absolute(ci), np.absolute(cj)) >= xld2) &  (np.absolute(ci) > xld2)
    ci[listings] = xld2*np.sign(ci[listings])
    listings = (np.maximum(np.absolute(ci), np.absolute(cj)) >= xld2) &  (np.absolute(cj) > xld2)
    cj[listings] = xld2*np.sign(cj[listings])
    di = 2.0*rui  + ci-cj*uij
    dj = -2.0*ruj + cj-ci*uij
    sepsq = np.sum(sep*sep, axis=1)
    over = sepsq + ci*di + cj*dj < 1.0
    return over

def iso_vector(n_loop):
    """Generates n_loop unit vectors in the isotropic phase"""
    u = np.empty((n_loop, 3))
    u[:,2] = np.random.uniform(0,1,n_loop)
    x = np.random.uniform(0,twopi,n_loop)
    fac = np.sqrt(1.0-u[:,2]*u[:,2])
    u[:,0] = fac*np.cos(x)
    u[:,1] = fac*np.sin(x)
    return u

def nem_vector(n_loop):
    """Generates n_loop unit vectors in the perfectly aligned nematic phase"""
    u = np.zeros((n_loop, 3))
    u[:,2] = 1.0
    return u

def onsager_vector(S2, alpha, sh_alpha, n_loop):
    """Generates n_loop unit vectors from the Onsager trial function for a given nematic order parameter"""
    u = np.empty((n_loop, 3))
    x = np.random.uniform(0,sh_alpha,n_loop)
    u[:,2] = np.log(x + np.sqrt(1.0 + x*x))/alpha
    x = np.random.uniform(0,twopi,n_loop)
    fac = np.sqrt(1.0-u[:,2]*u[:,2])
    u[:,0] = fac*np.cos(x)
    u[:,1] = fac*np.sin(x)
    return u

def onsager_B2_vectorized(n_loop, n_points):
    """Calculates the second virial coefficient, using n_loop configurations for each of the n_points values of S_nem"""
    rmax = 2.0*xld2 + 1.0
    fact = 4.0*rmax*rmax*rmax
    output = {}
    output['S2'] = np.zeros(n_points)
    output['B2'] = np.zeros(n_points)
    output['error'] = np.zeros(n_points)
    B2_sub = np.empty(n_loop)
    r1 = np.zeros(3)
    for i_s in range(n_points):
        r2 = np.random.uniform(-rmax,rmax,(n_loop,3))
        S2 = i_s/(n_points-1)
        if S2 < 1.0 and S2 > 0.0:
            alpha = find_alpha(S2)
            sh_alpha = np.sinh(alpha)
        if S2 < 1e-10:
            u1 = iso_vector(n_loop)
            u2 = iso_vector(n_loop)
        elif S2 > 1.0-1e-10:
            u1 = nem_vector(n_loop)
            u2 = nem_vector(n_loop)
        else:
            u1 = onsager_vector(S2, alpha, sh_alpha, n_loop)
            u2 = onsager_vector(S2, alpha, sh_alpha, n_loop)
        n_over = overlap_vector(r2-r1, u1, u2, n_loop)
       # n_over = np.zeros(n_loop)
       # for i in range(n_loop):
       #     if overlap(r2[i,:]-r1[:], u1[i,:], u2[i,:]):
       #         n_over[i] += 1.0
        #print(np.sum(n_over), (u1*u1).sum(axis))
        B2 = fact*np.sum(n_over)/n_loop
        error = fact*n_over.std()/np.sqrt(n_loop-1)
        output['S2'][i_s] =  S2
        output['B2'][i_s] =  B2
        output['error'][i_s] =  error
        print(output['S2'][i_s] ,output['B2'][i_s] ,output['error'][i_s] )
    return output

Now we run it all. Before we set n_trial = n_batch = 200, giving 200*200 = 40,000 configurations.

Now we will use n_loop = 2000*2000 = 4,000,000 configurations

twopi = 2.0*np.pi
xld2 = 2.5
t0 = time.clock()
B2_vec = onsager_B2_vectorized(2000*2000,21)
output_table_vec = pd.DataFrame(B2_vec)
output_table_vec = output_table_vec[["S2", "B2", "error"]]
#print(output_table_vec)
t1 = time.clock()
print('')
print('CPU time for vectorized method:', t1-t0)

0.0 37.451755102 0.0837820429342
0.05 37.3804408163 0.0837058483699
0.1 37.4033632653 0.0837303485435
0.15 37.0781387755 0.0833819320596
0.2 36.9000489796 0.0831904020829
0.25 36.6526040816 0.0829234045625
0.3 36.3585306122 0.0826047531538
0.35 36.0583836735 0.082278002326
0.4 35.4943346939 0.0816597443248
0.45 35.0564571429 0.0811759243677
0.5 34.413844898 0.0804596429098
0.55 33.7296979592 0.0796886982638
0.6 32.749322449 0.0785684020312
0.65 32.0457795918 0.0777527735332
0.7 31.2454530612 0.0768126368573
0.75 30.0644571429 0.0754004052049
0.8 29.0276571429 0.0741349146864
0.85 27.5147755102 0.0722425186021
0.9 25.789322449 0.0700127787904
0.95 23.5305795918 0.0669665653808
1.0 17.7982040816 0.058439409235

CPU time for vectorized method: 60.933426999999995

We acheived a similar time of 1minute, but we generated 100x more trial configurations (which means that the error on our result is 10 times smaller).

We now plot the two results, from which the reduction in the error should be visible.

plt.errorbar(output_table.S2, output_table.B2, output_table.error, color = "red", 
             label="original", marker = ".", markersize='10', elinewidth=2)
plt.errorbar(output_table_vec.S2, output_table_vec.B2, output_table_vec.error, 
             color="blue", label = "vectorized", marker = "s", markersize='7', elinewidth=2)
plt.legend(loc="lower left")
plt.xlabel("S2")
plt.ylabel("B2")
plt.margins(0.02)
plt.title("B2 vs S2: after 1 minute computation time")
plt.show()

Of course, the method using loops can be coded in in Fortran, and will run 3.5 times faster than above (so will reduce the error still further, by about a factor of 1.9, for the same computing time. But, if we do want to use python, vectorizing is the way to go.

Let's now add our extermal Fortran result to the plot

fortran_result=pd.read_csv("B2_L5d0_D1d0-race-csv.dat", header=None, names=["S2", "alpha", "B2", "error"])
print(fortran_result)

      S2         alpha         B2     error
0   0.00  0.000000e+00  37.423111  0.045164
1   0.05  8.985869e-01  37.404843  0.047013
2   0.10  1.321999e+00  37.369543  0.046639
3   0.15  1.689315e+00  37.175451  0.046346
4   0.20  2.042078e+00  36.897614  0.046444
5   0.25  2.399357e+00  36.697721  0.046934
6   0.30  2.774537e+00  36.255847  0.045914
7   0.35  3.179995e+00  35.959680  0.045229
8   0.40  3.629410e+00  35.423321  0.045134
9   0.45  4.139693e+00  34.992185  0.045549
10  0.50  4.733319e+00  34.294320  0.045141
11  0.55  5.441841e+00  33.675758  0.044878
12  0.60  6.311799e+00  32.876866  0.044084
13  0.65  7.415565e+00  32.064706  0.042787
14  0.70  8.872984e+00  31.135166  0.042287
15  0.75  1.089898e+01  30.114041  0.041977
16  0.80  1.392262e+01  28.845751  0.041103
17  0.85  1.894427e+01  27.531175  0.040358
18  0.90  2.896424e+01  25.803360  0.040318
19  0.95  5.898275e+01  23.523634  0.037443
20  1.00  1.000000e+14  17.808274  0.031975

plt.errorbar(output_table.S2, output_table.B2, output_table.error, color = "red", 
             label="original", marker = ".", markersize='10', elinewidth=2)

plt.errorbar(output_table_vec.S2, output_table_vec.B2, output_table_vec.error, 
             color="blue", label = "vectorized", marker = "s", markersize='7', elinewidth=2)

plt.errorbar(fortran_result.S2, fortran_result.B2, fortran_result.error, color="green", 
             label = "fortran", marker = "v", markersize='10', elinewidth=2)

plt.legend(loc="lower left")
plt.xlabel("S2")
plt.ylabel("B2")
plt.margins(0.02)
plt.title("B2 vs S2: after 1 minute computation time")
plt.show()

plt.scatter(output_table.S2, (output_table.error/output_table.B2)*100.0, color = "red", 
            label="original", marker = ".", s=120)

plt.scatter(output_table_vec.S2, (output_table_vec.error/output_table_vec.B2)*100.0, 
            color="blue", label = "vectorized", marker = ",", s=60)

plt.scatter(fortran_result.S2, (fortran_result.error/fortran_result.B2)*100.0, 
            color="green", label = "fortran", marker = "v", s=60)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("S2")
plt.ylabel("error (%)")
plt.margins(0.02)
plt.title("% error in B2 vs S2: after 1 minute computation time")
plt.show()

And finally, we import a Fortran function we made to calculate B2, which we can call from python.

This is identical to the Fortran code we used externally, just adapted to be readible by Python.

import onsager_B2    #import the Fortran module

t0 = time.clock()
output_table_f2py = pd.DataFrame(np.column_stack(onsager_B2.onsager_b2(5.0,1.0, 10000*1200, 21)),columns=['S2','B2', 'error'])
#output_table_vec = output_table_vec[["S2", "B2", "error"]]
print(output_table_f2py)
t1 = time.clock()
print('')
print('CPU time for imported Fortran function:', t1-t0)

      S2         B2     error
0   0.00  37.394208  0.050657
1   0.05  37.432080  0.050750
2   0.10  37.335960  0.050729
3   0.15  37.176408  0.050780
4   0.20  36.933480  0.050503
5   0.25  36.680904  0.050325
6   0.30  36.325224  0.050029
7   0.35  35.924112  0.049892
8   0.40  35.434224  0.049566
9   0.45  34.973136  0.049173
10  0.50  34.308720  0.048584
11  0.55  33.581160  0.048283
12  0.60  32.934744  0.047841
13  0.65  32.082912  0.047029
14  0.70  31.138488  0.046450
15  0.75  30.171888  0.045802
16  0.80  28.918296  0.044765
17  0.85  27.431136  0.043744
18  0.90  25.785360  0.042369
19  0.95  23.481648  0.040535
20  1.00  17.850168  0.035381

CPU time for imported Fortran function: 60.74180999999999

This turns out to be a little slower than the external function, but not by much (a few percent maybe)

Now we can plot all these

fortran_result=pd.read_csv("B2_L5d0_D1d0-race-csv.dat", header=None, names=["S2", "alpha", "B2", "error"])

plt.errorbar(output_table.S2, output_table.B2, output_table.error, color = "red", 
             label="original", marker = ".", markersize='10', elinewidth=2)

plt.errorbar(output_table_vec.S2, output_table_vec.B2, output_table_vec.error, 
             color="blue", label = "vectorized", marker = "s", markersize='7', elinewidth=2)

plt.errorbar(fortran_result.S2, fortran_result.B2, fortran_result.error, color="green", 
             label = "fortran (ext)", marker = "v", markersize='10', elinewidth=2)

plt.errorbar(output_table_f2py.S2, output_table_f2py.B2, output_table_f2py.error, 
             color="cyan", label = "fortran (int)", marker = "*", markersize='10', elinewidth=2)

plt.legend(loc="lower left")
plt.xlabel("S2")
plt.ylabel("B2")
plt.margins(0.02)
plt.title("B2 vs S2: after 1 minute computation time")
plt.show()

plt.scatter(output_table.S2, (output_table.error/output_table.B2)*100.0, color = "red", 
            label="original", marker = ".", s=120)

plt.scatter(output_table_vec.S2, (output_table_vec.error/output_table_vec.B2)*100.0, 
            color="blue", label = "vectorized", marker = ",", s=60)

plt.scatter(fortran_result.S2, (fortran_result.error/fortran_result.B2)*100.0, 
            color="green", label = "fortran (ext)", marker = "v", s=60)

plt.scatter(output_table_f2py.S2, (output_table_f2py.error/output_table_f2py.B2)*100.0, 
            color="cyan", label = "fortran (int)", marker = "*", s=90)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("S2")
plt.ylabel("error (%)")
plt.margins(0.02)
plt.title("% error in B2 vs S2: after 1 minute computation time")
plt.show()

Matthew Dennison

Contents

Calculating the second virial coefficient ...

... or the importance of vectorizing