Shengbin's Studio.

Optical Character Recognition

2017/12/23

Image Segmentation

1
2
3
4
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
1
2
3
4
def showImage(image):
    """Display ``image`` in a new 8x8-inch matplotlib figure and show it."""
    figure_size = (8, 8)
    plt.figure(figsize=figure_size)
    plt.imshow(image)
    plt.show()
1
2
# Load the photograph as a 3-channel colour image.
image_path = 'IMG_0377_o.JPG'
img = cv2.imread(image_path, cv2.IMREAD_COLOR)
# cv2.imread reports a missing or unreadable file by returning None rather
# than raising, so fail here with a clear message instead of later in imshow.
if img is None:
    raise FileNotFoundError("Could not read image: " + image_path)
showImage(img)

png

Use the Canny Algorithm in OpenCV to extract the edges

The Canny algorithm is applied to extract the edges in this image; the edges can then be used to locate contours.

1
2
3
4
5
6
7
# Extract edges with Canny, using 200 and 240 as the two hysteresis thresholds.
edges = cv2.Canny(img, 200, 240)

# Show the input and the edge map side by side, without axis ticks.
plt.figure(figsize=(16, 8))
panels = [(121, img, 'Original Image'), (122, edges, 'Edge Image')]
for position, picture, caption in panels:
    plt.subplot(position)
    plt.imshow(picture, cmap='gray')
    plt.title(caption)
    plt.xticks([])
    plt.yticks([])
plt.show()

png

Find contours based on the edges extracted from the image

The edges found by Canny are used to find the contours of this image. From the image shown below, we can see that a lot of contours are extracted from the image.

1
2
3
4
# cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x but only
# (contours, hierarchy) in OpenCV 2.x/4.x; taking the last two items works on
# every version. The returned image of 3.x was never used, so it is dropped.
contours, hierarchy = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2:]
# Draw every external contour (index -1) in green on a copy of the input.
imcopy = img.copy()
cv2.drawContours(imcopy, contours, -1, (0, 255, 0), 10)
showImage(imcopy)

png

Find the Max Contour with the largest contour area

For this problem, the white paper sheet in this image has the largest contour, so we can extract the contour of the paper sheet by finding the contour with the largest area.

1
2
3
4
# Pick the contour enclosing the largest area.
c = max(contours, key=cv2.contourArea)
imcopy = img.copy()
# drawContours expects a *list* of contours. Passing the bare contour array
# makes OpenCV treat each point as its own one-point contour, so only
# isolated dots are drawn; wrapping it in a list draws the full outline.
cv2.drawContours(imcopy, [c], -1, (0, 255, 0), 10)
showImage(imcopy)

png

Approximate the Contour

After finding the contour of the white paper sheet, we can use a geometric shape such as a rectangle or a polygon to approximate this contour. From the results shown below, we can see that the polygon did well in the shape approximation.

rectangle approximation
1
2
3
4
# Approximate the largest contour with its upright bounding rectangle
# and draw that rectangle in green on a copy of the input.
imcopy = img.copy()
x, y, w, h = cv2.boundingRect(c)
top_left = (x, y)
bottom_right = (x + w, y + h)
cv2.rectangle(imcopy, top_left, bottom_right, (0, 255, 0), 10)
showImage(imcopy)

png

Polygon approximation

By applying polygon approximation, the 4 corner points are extracted; they are used in the later perspective-transformation experiment.

1
2
3
4
5
6
7
8
9
# Approximate the closed contour by a polygon whose maximum deviation from
# the contour is 1% of the contour's perimeter.
epsilon = 0.01 * cv2.arcLength(c, True)
approx = cv2.approxPolyDP(c, epsilon, True)
imcopy = img.copy()
cv2.drawContours(imcopy, [approx], -1, (0, 255, 0), 12)
# Label every vertex that was actually found. The former four hard-coded
# putText calls raised IndexError whenever the polygon had fewer than 4
# vertices. approx is indexed as approx[k][0] == (x, y).
for index, corner in enumerate(approx, start=1):
    corner_x, corner_y = corner[0]
    cv2.putText(imcopy, 'Point %d' % index, (int(corner_x), int(corner_y)),
                cv2.FONT_HERSHEY_PLAIN, 6, (255, 0, 0), 10, cv2.LINE_AA)
showImage(imcopy)

png

Applied Perspective Transformation

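  • A projective transformation (perspective transformation) is the combination of an affine transformation and a projective warp.
    Suppose $(x, y, 1)$ is a point in homogeneous coordinates. The projective transformation of this point is as follows:

    $$\begin{bmatrix} x' \\ y' \\ w' \end{bmatrix} = \begin{bmatrix} a & b & c \\ d & e & f \\ g & h & 1 \end{bmatrix} \begin{bmatrix} x \\ y \\ 1 \end{bmatrix}$$

This 8-parameter matrix maps the point $(x, y, 1)$ in one projective plane to the point $(x'/w', y'/w', 1)$ in another projective plane.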

Each point mapping gives 2 equations, so to solve for the 8 parameters of this transformation we need at least 4 point mappings. Once the transformation has been solved, we can apply it to get a new image.

1
2
3
4
5
6
7
8
9
10
imcopy = img.copy()
# Source points: the polygon vertices found above.
pts1 = np.float32(approx)
# Destination rectangle, proportional to a US Letter sheet (425:550 = 8.5:11).
# NOTE(review): the corner order below must match the order of the points in
# approx; nothing here sorts the corners, so confirm for other input images.
out_w, out_h = 425, 550
pts2 = np.float32([[0, out_h], [out_w, out_h], [out_w, 0], [0, 0]])
M = cv2.getPerspectiveTransform(pts1, pts2)
dst = cv2.warpPerspective(imcopy, M, (out_w, out_h))

# Show the original next to the rectified sheet.
plt.figure(figsize=(8, 16))
plt.subplot(121)
plt.imshow(imcopy)
plt.title('Input')
plt.subplot(122)
plt.imshow(dst)
plt.title('Output')
plt.show()

png

Build a CNN model with tensorflow

1
2
3
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
1
2
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST data sets from the 'MNIST_data' directory with one-hot labels.
# NOTE(review): presumably downloads the files on first use — confirm.
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
1
2
3
4
5
6
def weight_variable(shape, name):
    """Return a tf.Variable called ``name`` with the given ``shape``,
    initialised from a truncated normal distribution (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)
def bias_variable(shape, name):
    """Return a tf.Variable called ``name`` with the given ``shape``,
    with every element initialised to the constant 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape), name=name)
1
2
3
4
def conv2d(x, W):
    """Convolve ``x`` with filter ``W`` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool_2x2(x):
    """Max-pool ``x`` over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
1
2
3
4
5
def create_placeholders(n_x=784, n_y=10):
    """Create the float32 input placeholders of the model.

    Returns a tuple ``(x, y_, keep_prob)``: a ``[None, n_x]`` placeholder for
    the inputs, a ``[None, n_y]`` placeholder for the labels, and a
    placeholder without a declared shape for the dropout keep probability.
    """
    features = tf.placeholder(tf.float32, shape=[None, n_x])
    labels = tf.placeholder(tf.float32, shape=[None, n_y])
    dropout_keep = tf.placeholder(tf.float32)
    return features, labels, dropout_keep
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def initialize_parameters():
    """Create the weight and bias variables of the four layers.

    Returns a dict with the keys ``W_<layer>`` and ``b_<layer>`` for the
    layers ``conv1``, ``conv2``, ``fc1`` and ``fc2`` (the read-out layer).
    """
    # (layer name, weight shape); each bias has one entry per output unit,
    # i.e. the last dimension of the weight shape.
    layer_shapes = [
        ('conv1', [5, 5, 1, 32]),        # first CNN layer
        ('conv2', [5, 5, 32, 64]),       # second CNN layer
        ('fc1', [7 * 7 * 64, 1024]),     # fully connected layer
        ('fc2', [1024, 10]),             # read-out layer
    ]
    parameters = {}
    for layer, w_shape in layer_shapes:
        w_name = 'W_' + layer
        b_name = 'b_' + layer
        parameters[w_name] = weight_variable(w_shape, w_name)
        parameters[b_name] = bias_variable([w_shape[-1]], b_name)
    return parameters
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def forward_prop(x, keep_prob, parameters):
    """Build the forward pass of the CNN and return the output logits.

    ``x`` is reshaped to ``[-1, 28, 28, 1]``, passed through two
    conv + ReLU + 2x2 max-pool stages, flattened to ``7*7*64`` features,
    passed through a ReLU fully connected layer with dropout
    (``keep_prob``), and finally through the linear read-out layer.
    ``parameters`` is the dict produced by ``initialize_parameters``.
    """
    p = parameters
    images = tf.reshape(x, [-1, 28, 28, 1])

    # Convolution stage 1
    conv1 = tf.nn.relu(conv2d(images, p['W_conv1']) + p['b_conv1'])
    pool1 = max_pool_2x2(conv1)

    # Convolution stage 2
    conv2 = tf.nn.relu(conv2d(pool1, p['W_conv2']) + p['b_conv2'])
    pool2 = max_pool_2x2(conv2)

    # Fully connected layer with dropout
    flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.nn.relu(tf.matmul(flat, p['W_fc1']) + p['b_fc1'])
    dropped = tf.nn.dropout(dense, keep_prob)

    # Read-out layer (logits, no softmax)
    return tf.matmul(dropped, p['W_fc2']) + p['b_fc2']
1
2
3
4
5
6
# Smoke test: rebuild the graph from scratch and print the symbolic output
# tensor of the forward pass (with the dropout keep probability fixed at 1.0).
tf.reset_default_graph()
with tf.Session() as sess:
    x, y_, keep_prob = create_placeholders()
    parameters = initialize_parameters()
    y_conv = forward_prop(x, 1.0, parameters)
    print("y_conv = " + str(y_conv))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# ---- Hyper-parameters -------------------------------------------------
num_epochs = 16
learning_rate = 1e-4
batch_size = 50
costs = []

# ---- Graph construction -----------------------------------------------
x, y_, keep_prob = create_placeholders()
parameters = initialize_parameters()
y_conv = forward_prop(x, keep_prob, parameters)

# Mean softmax cross-entropy between the one-hot labels and the logits.
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
## Optimizer
train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
init = tf.global_variables_initializer()

# ---- Training ---------------------------------------------------------
with tf.Session() as sess:
    sess.run(init)

    # Derived from the data set instead of the former hard-coded 55000/50
    # and range(1100), so the loop length and the cost average always agree.
    num_minibatches = mnist.train.num_examples // batch_size

    for epoch in range(num_epochs):
        epoch_cost = 0.0

        for i in range(num_minibatches):
            batch = mnist.train.next_batch(batch_size)

            # Report the accuracy on the current batch with dropout disabled.
            if i % 100 == 0:
                train_accuracy = accuracy.eval(feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                print('epoch %d, step %d, training accuracy %g' % (epoch, i, train_accuracy))

            # One optimisation step with 50% dropout.
            _, minibatch_cost = sess.run([train_step, cross_entropy],
                                         feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
            epoch_cost += minibatch_cost / num_minibatches

        # NOTE(review): the source lost its indentation; the cost is assumed
        # to be both printed and recorded on even epochs only — confirm.
        if epoch % 2 == 0:
            print("Cost after epoch %i: %f" % (epoch, epoch_cost))
            costs.append(epoch_cost)

    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations per epoch')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    # Replace the variables in the dict by their trained values so that
    # they can be used (and saved) outside of the session.
    parameters = sess.run(parameters)
    print("Parameters have been trained!")

    print('test accuracy %g' % accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))

png

Parameters have been trained!
test accuracy 0.9927

Save the parameters to local data

Because the CNN model takes a long time to train, we can save time by storing the trained parameters in a local file.

1
2
import pickle
# Use a context manager so the file is flushed and closed even if
# pickling fails (the bare open() call never closed its handle).
with open("params.pkl", "wb") as params_file:
    pickle.dump(parameters, params_file)

Load data from saved file

1
# NOTE: unpickling can execute arbitrary code; only load files that were
# written by this script. The context manager closes the file handle that
# the bare open() call used to leak.
with open("params.pkl", "rb") as params_file:
    testparams = pickle.load(params_file)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def predict(x, parameters):
    """Predict the class index of each row of ``x``.

    Args:
        x: input fed to a float32 placeholder of shape ``[None, 784]``.
        parameters: dict holding the trained values for the keys
            ``W_conv1``, ``b_conv1``, ``W_conv2``, ``b_conv2``, ``W_fc1``,
            ``b_fc1``, ``W_fc2`` and ``b_fc2``; each value is converted with
            ``tf.convert_to_tensor``.

    Returns:
        The result of ``argmax`` over axis 1 of the network output.
    """
    names = ('W_conv1', 'b_conv1', 'W_conv2', 'b_conv2',
             'W_fc1', 'b_fc1', 'W_fc2', 'b_fc2')

    # Build the network in a private graph: the previous version added a new
    # copy of all constants and ops to the default graph on every call.
    with tf.Graph().as_default():
        tensors = {name: tf.convert_to_tensor(parameters[name]) for name in names}

        x_input = tf.placeholder(tf.float32, shape=[None, 784])
        y_out = forward_propagation_for_predict(x_input, tensors)
        p = tf.argmax(y_out, 1)

        # The context manager closes the session; it used to be leaked.
        with tf.Session() as sess:
            return sess.run(p, feed_dict={x_input: x})
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def forward_propagation_for_predict(x, parameters):
    """Build the inference-time forward pass and return the output logits.

    This used to be a line-by-line copy of ``forward_prop`` with the dropout
    keep probability fixed at 1.0; it now delegates to ``forward_prop`` so
    the network architecture is defined in one place only.

    Args:
        x: input tensor, reshaped to ``[-1, 28, 28, 1]`` by ``forward_prop``.
        parameters: dict with the keys ``W_conv1``, ``b_conv1``, ``W_conv2``,
            ``b_conv2``, ``W_fc1``, ``b_fc1``, ``W_fc2`` and ``b_fc2``.
    """
    return forward_prop(x, 1.0, parameters)

Experiment of Self-Written Digits Image

1
import cv2
1
2
3
img = cv2.imread('test_digit4.png', cv2.IMREAD_COLOR)
# cv2.imread returns the channels in BGR order, so the conversion must be
# BGR2GRAY; RGB2GRAY applies the red and blue luminance weights swapped.
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Smooth the grayscale image with a 5x5 Gaussian kernel.
im_gray = cv2.GaussianBlur(img, (5, 5), 0)
1
2
3
# Display the blurred grayscale image.
gray_figure_size = (10, 5)
plt.figure(figsize=gray_figure_size)
plt.imshow(im_gray, interpolation='bicubic', cmap='gray')
plt.show()

png

1
# Smooth the grayscale image with a 5x5 Gaussian kernel.
# NOTE(review): this repeats the blur already stored in im_gray, and blur_img
# is not referenced anywhere else in the visible source — confirm it is needed.
blur_img = cv2.GaussianBlur(img, (5,5), 0)
1
2
3
# Inverse binary threshold: pixels above 127 become 0, all others 255.
# NOTE(review): this thresholds the unblurred img, not im_gray — confirm.
ret, thresh = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
# cv2.findContours returns 3 values in OpenCV 3.x but 2 in OpenCV 2.x/4.x;
# the last two items are (contours, hierarchy) on every version.
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2:]
# One (x, y, w, h) bounding box per external contour.
rects = [cv2.boundingRect(contour) for contour in contours]
1
2
# Display the thresholded (inverted binary) image.
plt.imshow(
    thresh,
    interpolation='bicubic',
    cmap='gray',
)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
def generateImage(small):
    """Fit a cropped digit into a 28x28 ``uint8`` image.

    The crop is resized to a height of 20 pixels with its aspect ratio kept,
    dilated, and pasted into the centre of a black 28x28 canvas.

    Args:
        small: 2-D single-channel image containing one digit.

    Returns:
        A 28x28 ``numpy.uint8`` array.

    Raises:
        ValueError: if ``small`` has no rows or no columns.
    """
    row, col = small.shape
    if row == 0 or col == 0:
        raise ValueError("generateImage requires a non-empty image, got shape %r" % (small.shape,))

    # Keep the aspect ratio, but clamp the width to [1, 28]: very thin crops
    # used to request a zero-width resize, and wide crops produced a negative
    # horizontal offset below and were written outside the canvas.
    target_width = min(28, max(1, 20 * col // row))
    small = cv2.resize(small, (target_width, 20), interpolation=cv2.INTER_AREA)
    # NOTE(review): the kernel is the tuple (3, 3), not a 3x3 array such as
    # np.ones((3, 3), np.uint8); kept unchanged to preserve the output —
    # confirm the intended structuring element.
    small = cv2.dilate(small, (3, 3))

    blank_image = np.zeros((28, 28), np.uint8)
    row, col = small.shape
    shift_x = (28 - row) // 2   # vertical offset (rows)
    shift_y = (28 - col) // 2   # horizontal offset (columns)
    # Paste the digit into the centre of the canvas in one slice assignment.
    blank_image[shift_x:shift_x + row, shift_y:shift_y + col] = small
    return blank_image
1
2
3
4
5
6
7
8
9
10
11
12
13
14
images = []
for rect_x, rect_y, rect_w, rect_h in rects:
    # Draw the bounding rectangle of the digit
    cv2.rectangle(img, (rect_x, rect_y), (rect_x + rect_w, rect_y + rect_h), (0, 255, 0), 3)
    # Region around the digit, 10% larger than the bounding box and centred
    # on it. row_span counts rows (vertical), col_span counts columns.
    row_span = int(rect_h * 1.1)
    col_span = int(rect_w * 1.1)
    row_start = int(rect_y + rect_h // 2 - row_span // 2)
    col_start = int(rect_x + rect_w // 2 - col_span // 2)
    # Clamp the start to 0: for a digit touching the top/left border the
    # start is negative, and a negative slice start counts from the far end
    # of the array, giving an empty or wrong region.
    roi = thresh[max(row_start, 0):row_start + row_span,
                 max(col_start, 0):col_start + col_span]
    # Resize, dilate and centre the digit on a 28x28 canvas
    images.append(generateImage(roi))
1
2
3
4
5
6
7
# Show every extracted digit in a grid with 5 columns. The grid keeps its
# original 2 rows and grows beyond that when there are more than 10 digits
# (a fixed 2x5 grid made add_subplot fail on the 11th image).
grid_cols = 5
grid_rows = max(2, -(-len(images) // grid_cols))  # ceiling division
fig = plt.figure(figsize=(12, 4))
for i, image in enumerate(images):
    sub_fig = fig.add_subplot(grid_rows, grid_cols, i + 1)
    sub_fig.imshow(image, cmap='gray', interpolation='bicubic')
plt.show()

png

Test on my own handwritten digits

1
2
3
4
5
6
7
8
9
# Classify the extracted digits and show each one with its predicted label.
grid_cols = 5
# Keep the original 2 rows, but grow the grid when there are more than 10
# digits (a fixed 2x5 grid made add_subplot fail on the 11th image).
grid_rows = max(2, -(-len(images) // grid_cols))  # ceiling division
fig_out = plt.figure(figsize=(12, 6))
if images:
    # Flatten every 28x28 image to 784 values and classify the whole batch
    # with a single predict() call instead of one call per image.
    batch = np.array([image.reshape(28 * 28) for image in images], 'float32')
    labels = predict(batch, testparams)
    for i, (image, label) in enumerate(zip(images, labels)):
        sub_fig = fig_out.add_subplot(grid_rows, grid_cols, i + 1)
        sub_fig.annotate(str(label), xy=(2, 1), size=25, color='#ee8d18', xytext=(4, 4))
        sub_fig.imshow(image.reshape(28, 28), cmap='gray', interpolation='bicubic')
plt.show()

png

CATALOG
  1. 1. Image Segmentation
    1. 1.1. Use the Canny Algorithm in OpenCV to extract the edges
    2. 1.2. Find contours based on the edges extracted from the image
    3. 1.3. Find the Max Contour with the largest contour area
    4. 1.4. Approximate the Contour
      1. 1.4.1. Rectangle approximation
      2. 1.4.2. Polygon approximation
    5. 1.5. Applied Perspective Transformation
  2. 2. Build a CNN model with tensorflow
    1. 2.1. Save the parameters to local data
    2. 2.2. Load data from saved file
    3. 2.3. Experiment of Self-Written Digits Image
    4. 2.4. Test on my own handwritten digits