Numpy

HaiyueAugust 26, 2023About 4 min

Instruction

Fast vectorised array operations for data munging and cleaning, subsetting and filtering, transformation, and any other kinds of computations.

Efficient descriptive statistics and aggregating/summarising data.

Data alignment and relational data manipulations for merging and joining together heterogeneous data sets.

Expressing conditional logic as array expressions instead of loops with if-elif-else branches

Groupwise data manipulations (aggregation, transformation, function application) – useful for pandas!

Simple code

import numpy as np

data = np.array(
  [[5,-10,2], 
   [4, 3, 9]])

type(data)
data

data * 10
data * data

dir(data)

data.size
data.ndim
data.shape
data.dtype

Create numpy array


np.zeros(10)

np.zeros((3,5,3))
np.ones((3,6))

np.empty((2,3))

np.arange(15)

a = np.zeros((3,6))
np.ones_like(a)

np.ones(a.shape)

np.eye(4)

numpy data types and conversions

a = np.array([1.1,2.2,3.3,4,5])
a.dtype

b = a.astype(np.int64)
b.dtype

c = np.array(["aaaaa",2,3,4,4], dtype=np.string_)
c.dtype

c.astype(float)

int_array = np.arange(10)
float_array = np.array([.2, .27, .357])
int_array.astype(float_array.dtype)

Slicing and Indexing

Slicing in 1 D data

arr = np.arange(10)
print(arr)

arr[5]

arr[5:8] = 99
print(arr)

# Array slice is a view of the origianl array
arr = np.arange(10)
arr
brr = arr[5:8]
brr

arr[5:8] = 99
arr
brr

Indexing and Slicing in 2D

arr2d = np.array([[1, 2, 3], [4, 5, 6],[7, 8, 9]])
arr2d.shape
arr2d
arr2d[1][0]        # the same as below
arr2d[1,0]

arr2d[0]           # you can skip later indecis (columns)
arr2d[:1]

arr2d[:, 1]        # for column, you have to mention first index
arr2d[1]

arr2d[:2, 1:]
arr2d[2, 1]

arr2d[1, :2]
arr2d[1:3, :2]

arr2d[1, :2].ndim
arr2d[1:3, :2].ndim

arr2d[1, :2].shape

a = arr2d[1, :2]
a.shape

arr2d[1:2, :2].shape

more than 2D

Indexing and Slicing in nD, n > 2

arr1d = np.arange(12)
arr3d = arr1d.reshape(2,3,2)

arr3d[0]
arr3d[0,0]
arr3d[0,0] = 99

arr3d[:,1]

Boolean indexing

names = np.array(['Belinda', 'Malgorzata', 'John', 'Belinda', 'John', 'Jasper', 'Jasper'])
colours = np.array(['green','red','blue','yellow','brown','green','purple'])

names == 'Belinda'
colours[names == 'Belinda']

colours2 = np.array([['green','red','blue','yellow','brown','green','purple'],
                     ['black','pink','pink','brown','white','red','orange']])

colours2[0, names == 'Belinda']
colours2[1, names == 'Belinda']
colours2[:, names != 'Belinda']
colours2[:, ~(names == 'Belinda')]

### We can combine using & and |
mask = (names == 'Belinda') | (names == 'Jasper')
colours2[:, mask]


data = np.array([-2,3,1,-4,-12,-3,19])
data

data[data < 0] = 0
data

Facy Indexing

Fancy Indexing - using integer array as index
Fancy indexing copies the data into a new array

x = range(8)
x = list(x)
x = x * 4

arr = np.array(x).reshape(4,8).T
arr[[4, 3, 0, 6]]
arr[[-3, -5, -7]]


arr = np.arange(32).reshape((8, 4))
arr

arr[[1, 5, 7, 2], [0, 3, 1, 2]]         # we got only for numbers 

arr[[1, 5, 7, 2]][:, [0, 3]]      # for 4 by 4 array

### ix_() as an alternative
arr[np.ix_([1, 5, 7, 2], [0, 3])]

Some numpy methods

Reshape

arr = np.arange(15).reshape((3, 5))
arr

Transpose

arr.T

Random numbers

arr = np.random.randn(6, 3)
arr

Dot product

np.dot(arr.T, arr)

high dimentional array

arr = np.arange(16).reshape((2, 2, 4))
arr

transpose

arr.transpose((1, 0, 2))

# swapaxes 
arr.swapaxes(1, 0)

unary ufuncs

Universal functions (ufuncs) - element-wise transformation

arr = np.arange(1,6)

np.square(arr)
np.sqrt(arr)
np.log(arr)


# binary ufunc
x = y = np.random.randn(8)
np.add(x, y)                  
np.multiply(x, y)
np.divide(x, y)


# return of multiple arrays
arr = np.random.randn(7) * 10

np.modf(arr)

Vectorisation

points = np.arange(-5, 5, 0.01) # 1000 equally spaced points
xs, ys = np.meshgrid(points, points)
ys

z = np.sqrt(xs ** 2 + ys ** 2)
z


import matplotlib.pyplot as plt
plt.imshow(z)
plt.colorbar()
plt.title("Image plot of $\sqrt{x^2 + y^2}$")

Conditional logic

xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
result

result = np.where(cond, xarr, yarr)
result


arr = np.random.randn(4,4)
arr

np.where(arr > 0, 2, -2)

np.sign(arr) * 2

# Another example

cond1 = np.array([True, True, False, False])
cond2 = np.array([True, False, True, False])
n = len(cond1)
result = []
for i in range(n):
    if cond1[i] and cond2[i]:
        result.append(0)
    elif cond1[i]: 
        result.append(1)
    elif cond2[i]: 
        result.append(2)
    else:
        result.append(3)
result


np.where(cond1 & cond2, 0, np.where(cond1, 1, np.where(cond2, 2, 3)))

Array aggregations

arr = np.random.randn(5, 4) 
arr

arr.mean(axis=0)                  # Column means
arr.std(axis=0)                   # Column standard deviation
arr.median(axis=0)                # Column medians

np.mean(arr, axis=0)              # Column means
np.std(arr, axis=0)               # Column standard deviation
np.median(arr, axis=0)            # Column medians


arr = np.random.randn(100)
(arr > 0).sum()                   # Number of positive values
(arr > 0).mean() == 0.48

Boolean arrays

bools = np.array([False, False, True, False])
bools.any() 
bools.all()

Input and Output

Binary File

arr = np.arange(10)
np.save('some_array.npy', arr)             # uncompressed binary

np.load('some_array.npy')


np.savez('array_archive.npz', a=arr, b=arr*2)

arch = np.load('array_archive.npz')        # zip archive
arch['b']
arch['a']

Text File

# more on reading files in pandas
arr = np.arange(12).reshape(4,3)
np.savetxt('array_ex.txt', arr, delimiter='==')

del arr
arr = np.loadtxt('array_ex.txt', delimiter='==')

arr = np.genfromtxt('array_ex.txt', delimiter='==')

Random Number Generation

generate

arr = np.random.normal(size=100000)

dir(np.random)

Simple random walk

import random
steps = 1000
position = 0
walk = [position]
for i in range(steps):
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)

import matplotlib.pyplot as plt
# %matplotlib inline

plt.plot(walk)
plt.title("Simple random walk with +1/-1 steps")
# plt.show()


steps = 1000
draws = np.random.choice([-1,1], size=steps)
walk = draws.cumsum()

import matplotlib.pyplot as plt
# %matplotlib inline

plt.plot(walk)
plt.title("Simple random walk with +1/-1 steps")

Simulating many random walks

nwalks = 5000
nsteps = 1000
draws = np.random.choice([-1,1], size=(nwalks, nsteps))
walks = draws.cumsum(1)

walks.max()
walks.min()

hits30 = (np.abs(walks) >= 30).any(1)
hits30.sum()

crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)
crossing_times.mean()

import matplotlib.pyplot as plt
# %matplotlib inline

plt.plot(walks.T)
plt.title(str(nwalks) + " simple random walks with +1/-1 steps")

plt.hist(walks[:,999], bins=53)  # 
plt.title("Distribution for the very last step")

References