# Easy(ish) notebook-based inner-loop optimization with C

In [1]:
import numpy as np

## Basic Python 

In [2]:
def py_addmul(inarr, tomul, toadd):
    out = [val*tomul + toadd for val in inarr]
    return np.array(out)

def np_addmul(inarr, tomul, toadd):
    return inarr*tomul + toadd     

In [3]:
py_addmul(np.arange(10, dtype=float), 2, 1.2)

array([  1.2,   3.2,   5.2,   7.2,   9.2,  11.2,  13.2,  15.2,  17.2,  19.2])

In [4]:
np_addmul(np.arange(10, dtype=float), 2, 1.2)

array([  1.2,   3.2,   5.2,   7.2,   9.2,  11.2,  13.2,  15.2,  17.2,  19.2])

## Cython 

In [5]:
import Cython
%load_ext Cython

In [6]:
%%cython

import numpy as np

def fast_addmul(inarr, tomul, toadd):
    outarr = np.empty_like(inarr)
    cdef double[:] inarr_memview = inarr
    
    c_fast_addmul(inarr_memview, outarr, len(inarr), toadd, tomul)
    
    return outarr

cdef c_fast_addmul(double[:] inarr, double[:] outarr, int arrlen, double toadd, double tomul):
    cdef int i
    for i in range(arrlen):
        outarr[i] = inarr[i]*tomul + toadd

In [7]:
fast_addmul(np.arange(10, dtype=float), 2, 1.2)

array([  1.2,   3.2,   5.2,   7.2,   9.2,  11.2,  13.2,  15.2,  17.2,  19.2])

In [8]:
n = 100

arr = np.arange(n, dtype=float)


%timeit py_addmul(arr, 1.2, 2)
%timeit np_addmul(arr, 1.2, 2)
%timeit fast_addmul(arr, 1.2, 2)
None

41 µs ± 1.69 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
2.53 µs ± 94.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
2.08 µs ± 45.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [9]:
n = 1000000

arr = np.arange(n, dtype=float)

%timeit py_addmul(arr, 1.2, 2)
%timeit np_addmul(arr, 1.2, 2)
%timeit fast_addmul(arr, 1.2, 2)
None

423 ms ± 29.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.15 ms ± 80.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.48 ms ± 40.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Optimize 

In [10]:
%%cython -a

import numpy as np

def fast_addmul(inarr, tomul, toadd):
    outarr = np.empty_like(inarr)
    cdef double[:] inarr_memview = inarr
    
    c_fast_addmul(inarr_memview, outarr, len(inarr), toadd, tomul)
    
    return outarr

cdef c_fast_addmul(double[:] inarr, double[:] outarr, int arrlen, double toadd, double tomul):
    cdef int i
    for i in range(arrlen):
        outarr[i] = inarr[i]*tomul + toadd

In [11]:
%%cython -a

import cython
import numpy as np

def fast_addmul_opt(inarr, tomul, toadd):
    outarr = np.empty_like(inarr)
    cdef double[:] inarr_memview = inarr
    
    c_fast_addmul_opt(inarr_memview, outarr, len(inarr), toadd, tomul)
    
    return outarr

@cython.boundscheck(False)
cdef c_fast_addmul_opt(double[:] inarr, double[:] outarr, int arrlen, double toadd, double tomul):
    cdef int i
    for i in range(arrlen):
        outarr[i] = inarr[i]*tomul + toadd

In [12]:
n = 500000

arr = np.arange(n, dtype=float)

%timeit fast_addmul(arr, 1.2, 2)
%timeit fast_addmul_opt(arr, 1.2, 2)
None

474 µs ± 20.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
365 µs ± 39.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# cffi in "inline" mode 

In [13]:
from cffi import FFI

In [14]:
ffibuilder = FFI()

ffibuilder.cdef("void cffi_addmul(double *, double *, double add, double mul, int arrlen);")

ffibuilder.set_source("_cffi_addmul_mod",
r"""
void cffi_addmul(double * a, double * b, double add, double mul, int arrlen) {
        for (int i=0;i<arrlen;i++){
            b[i] = a[i]*mul+add;
    }
}
""")

ffibuilder.compile(verbose=True)

generating ./_cffi_addmul_mod.c
(already up-to-date)
running build_ext
building '_cffi_addmul_mod' extension
clang -Wno-unused-result -Wsign-compare -Wunreachable-code -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -I/Users/erik/miniconda3/include -arch x86_64 -I/Users/erik/miniconda3/include/python3.5m -c _cffi_addmul_mod.c -o ./_cffi_addmul_mod.o
clang -bundle -undefined dynamic_lookup -L/Users/erik/miniconda3/lib -arch x86_64 ./_cffi_addmul_mod.o -L/Users/erik/miniconda3/lib -o ./_cffi_addmul_mod.cpython-35m-darwin.so


'/Users/erik/tmp/_cffi_addmul_mod.cpython-35m-darwin.so'

In [15]:
from _cffi_addmul_mod import ffi, lib

In [16]:
def cffi_addmul(inarr, toadd, tomul):
    outarr = np.empty_like(inarr)
    
    inptr = ffi.cast("double *", ffi.from_buffer(inarr))
    outptr = ffi.cast("double *", ffi.from_buffer(outarr))
    
    lib.cffi_addmul(inptr, outptr, toadd, tomul, len(inarr))
    
    return outarr

In [17]:
cffi_addmul(np.arange(10, dtype=float), 1.2, 2)

array([  1.2,   3.2,   5.2,   7.2,   9.2,  11.2,  13.2,  15.2,  17.2,  19.2])

In [18]:
n = 500000

arr = np.arange(n, dtype=float)

%timeit fast_addmul(arr, 1.2, 2)
%timeit fast_addmul_opt(arr, 1.2, 2)
%timeit cffi_addmul(arr, 1.2, 2)
None

468 µs ± 7.51 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
400 µs ± 26.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
338 µs ± 25 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
n = 500000

arr = np.arange(n, dtype=float)

%timeit py_addmul(arr, 1.2, 2)
%timeit np_addmul(arr, 1.2, 2)
%timeit fast_addmul(arr, 1.2, 2)
%timeit fast_addmul_opt(arr, 1.2, 2)
%timeit cffi_addmul(arr, 1.2, 2)
None

202 ms ± 9.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
692 µs ± 5.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
470 µs ± 7.27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
347 µs ± 7.75 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
314 µs ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [28]:
us = [202000,
      692,
      470,
      347,
      314]
us = np.array(us)
us/us[-1], us/us[1], us/us[0]

(array([ 643.31210191,    2.20382166,    1.49681529,    1.10509554,    1.        ]),
 array([ 291.90751445,    1.        ,    0.67919075,    0.50144509,
           0.45375723]),
 array([ 1.        ,  0.00342574,  0.00232673,  0.00171782,  0.00155446]))

### DOOM! (maybe?)

In [20]:
cffi_addmul(np.arange(10, dtype='float32'), 1.2, 2)

array([  4.17232506e-08,   1.90195310e+00,   3.15544367e-31,
         3.25468755e+00,   2.83953730e+31,   5.50014639e+00,
         3.85365778e+31,   7.50000906e+00,  -3.19744223e-15,
         1.00000019e+01], dtype=float32)