Check for a reconstruction of maximal base pair matching
packages = ["numpy", "matplotlib"]
terminal = false
def full_nussinov():
# construct a dynamic programming table
# from the input sequence to be used for backtracking
def build_table(seq, pairs, hairpin):
m = len(seq)
# initialize empty array
dp_table = np.zeros((m, m))
for h in range(1 + hairpin, m):
# we only care about upper triangle
# there is no need to recurse through zeros in bottom half
for i in range(m - h):
j = i + h
opt = -1
# case 1: if matches
if (seq[i], seq[j]) in pairs:
curr_opt = dp_table[i+1][j-1] + 1
if curr_opt > opt:
opt = curr_opt
# case 2: skip head
curr_opt = dp_table[i+1][j]
if curr_opt > opt:
opt = curr_opt
# case 3: skip tail
curr_opt = dp_table[i][j-1]
if curr_opt > opt:
opt = curr_opt
# case 4: bifurcation
for k in range(i+1+hairpin, j):
curr_opt = dp_table[i][k] + dp_table[k+1][j]
if curr_opt > opt:
opt = curr_opt
# set table cell equal to best value
dp_table[i][j] = opt
return dp_table
# takes in the completed table and traces from the top-right
# until it reaches the top-left to bottom-right boundary diagonal
def traceback(seq, dp_table, hairpin):
n = len(seq)
# intial starting point from the top-right
stack = [(0,n-1)]
pairs = []
while len(stack) > 0:
i,j = stack.pop()
if i + hairpin >= j:
continue
elif dp_table[i+1][j] == dp_table[i][j]:
stack.append((i+1, j))
elif dp_table[i][j-1] == dp_table[i][j]:
stack.append((i, j-1))
elif dp_table[i+1][j-1] +1 == dp_table[i][j]:
pairs.append((i, j))
stack.append((i+1, j-1))
else:
for k in range(i+1, j):
if dp_table[i][k] + dp_table[k+1][j] == dp_table[i][j]:
stack.append((k+1, j))
stack.append((i, k))
break
return pairs
# bracket notation
# using dashes instead of periods for easier visual distinguishing
def bracket_conversion(seq, pairs):
notation = ['-' for c in seq]
for (i, j) in pairs:
notation[i] = '('
notation[j] = ')'
return ''.join(notation)
def visualize_reconstruction(seq, table, record, hairpin):
'''
Creating backtrace from the given DP table and reconstruction
Reference for plotting:
https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html
'''
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
pairs = set(record)
left_p = set([r[0] for r in record])
right_p = set([r[1] for r in record])
n = len(seq)
visited = np.zeros((n, n))
arrows = [] # quadruples of (x, y, dx, dy)
v_mark = 1
stack = [(0,n-1)]
visited[0, n-1] = v_mark
while len(stack) > 0:
v_mark += 1
i,j = stack.pop()
if i + hairpin >= j:
continue
elif (i, j) in pairs:
# arrow to (i+1, j-1)
arrows.append((i, j, 1, -1))
visited[i+1, j-1] = v_mark
stack.append((i+1, j-1))
elif i in left_p and j in right_p:
for k in range(i+1 + hairpin, j):
if (i, k) in pairs:
# draw arrow to (i, k)
arrows.append((i, j, 0, k-j))
visited[i, k] = v_mark
stack.append((i, k))
# arrow to (k+1, j)
arrows.append((i, j, k+1-i, 0))
visited[k+1, j] = v_mark
stack.append((k+1, j))
break
elif i in left_p:
# arrow to (i, j-1)
arrows.append((i, j, 0, -1))
visited[i, j-1] = v_mark
stack.append((i, j-1))
else:
# arrow to (i+1, j)
arrows.append((i, j, 1, 0))
visited[i+1, j] = v_mark
stack.append((i+1, j))
rep = bracket_conversion(seq, record)
fig = plt.figure(figsize =(5, 5))
ax = plt.gca()
# revert the color for visited entries
visited[visited > 0] = visited.max() + 1 - visited[visited>0]
im = ax.imshow(visited, cmap='GnBu')
threshold = im.norm(im.get_array().max())/2.
table = np.array(table)
ax.set_xticks(np.arange(n))
ax.set_xticklabels([rep[i] + '\n' + seq[i] for i in range(n)])
ax.set_yticks(np.arange(n))
ax.set_yticklabels([seq[i] for i in range(n)])
ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)
plt.setp(ax.get_xticklabels(), rotation=0, ha="right",
rotation_mode="anchor")
# Turn spines off and create white grid.
for edge, spine in ax.spines.items():
spine.set_visible(False)
ax.set_xticks(np.arange(n+1)-.5, minor=True)
ax.set_yticks(np.arange(n+1)-.5, minor=True)
ax.grid(which="minor", color="w", linestyle='-', linewidth=3, zorder=0)
ax.tick_params(which="minor", bottom=False, left=False)
for i in range(n):
for j in range(n):
text = ax.text(j, i, table[i, j],
ha='center', va='center',
color='w' if im.norm(visited[i, j])>threshold else 'black')
for (x, y, dx, dy) in arrows:
if dy > 0:
dy -= 0.5
elif dy < 0:
dy += 0.5
if dx > 0:
dx -= 0.5
elif dx < 0:
dx += 0.5
ax.arrow(y, x, dy, dx, width=0.01, head_width=0.2, alpha=0.5,
color='r', zorder=2)
display(plt, target="matplotlib-plot", append=False)
plt.close()
return
import numpy as np
import matplotlib.pyplot as plt
from pyscript import Element
tuple_set = set([('A', 'U'), ('U', 'A'), ('C', 'G'), ('G', 'C'), ('G', 'U'), ('U', 'G')])
hairpin_input = Element('hairpin-input').element.value
if (len(hairpin_input) == 0):
hairpin = 0
else:
hairpin = int(hairpin_input)
rna_seq = Element('seq-input').element.value
if (len(rna_seq) == 0):
rna_seq = 'AUAUCG'
rna_seq = rna_seq.upper()
dp_table = build_table(rna_seq, tuple_set, hairpin)
connected_pairs = traceback(rna_seq, dp_table, hairpin)
solution = bracket_conversion(rna_seq, connected_pairs)
display(rna_seq, target="rna_sequence", append=False)
display(solution, target="bracket_solution", append=False)
visualize_reconstruction(rna_seq, dp_table, connected_pairs, hairpin)
Motivations
As we seek to uncover how protiens fold in the tertiary and quarternary structure, we must first look into how the structure appears in the two-dimensions. Nussinov algorithm is a technique which maximizes the number of base pair matchings. This is a rather naive method of generating a secondary structure, but is motivated by the belief that more matchings is indicative of stronger structural stability. While the results are often not biologically relevant, Nussinov provides a first step into looking at how simple dynamic programming algorithms can be extended for other methods such as energy minimization for structural prediction.
Note: PyScript (Python) is slow, give the page a couple seconds to load properly