RNA Structure Prediction

Nussinov Structure Prediction
Please type in your sequence. *Note: Please limit your inputs to the letters A, C, G, U
Please specify the number of hairpin loops
Check for a reconstruction of maximal base pair matching
packages = ["numpy", "matplotlib"] terminal = false def full_nussinov(): # construct a dynamic programming table # from the input sequence to be used for backtracking def build_table(seq, pairs, hairpin): m = len(seq) # initialize empty array dp_table = np.zeros((m, m)) for h in range(1 + hairpin, m): # we only care about upper triangle # there is no need to recurse through zeros in bottom half for i in range(m - h): j = i + h opt = -1 # case 1: if matches if (seq[i], seq[j]) in pairs: curr_opt = dp_table[i+1][j-1] + 1 if curr_opt > opt: opt = curr_opt # case 2: skip head curr_opt = dp_table[i+1][j] if curr_opt > opt: opt = curr_opt # case 3: skip tail curr_opt = dp_table[i][j-1] if curr_opt > opt: opt = curr_opt # case 4: bifurcation for k in range(i+1+hairpin, j): curr_opt = dp_table[i][k] + dp_table[k+1][j] if curr_opt > opt: opt = curr_opt # set table cell equal to best value dp_table[i][j] = opt return dp_table # takes in the completed table and traces from the top-right # until it reaches the top-left to bottom-right boundary diagonal def traceback(seq, dp_table, hairpin): n = len(seq) # intial starting point from the top-right stack = [(0,n-1)] pairs = [] while len(stack) > 0: i,j = stack.pop() if i + hairpin >= j: continue elif dp_table[i+1][j] == dp_table[i][j]: stack.append((i+1, j)) elif dp_table[i][j-1] == dp_table[i][j]: stack.append((i, j-1)) elif dp_table[i+1][j-1] +1 == dp_table[i][j]: pairs.append((i, j)) stack.append((i+1, j-1)) else: for k in range(i+1, j): if dp_table[i][k] + dp_table[k+1][j] == dp_table[i][j]: stack.append((k+1, j)) stack.append((i, k)) break return pairs # bracket notation # using dashes instead of periods for easier visual distinguishing def bracket_conversion(seq, pairs): notation = ['-' for c in seq] for (i, j) in pairs: notation[i] = '(' notation[j] = ')' return ''.join(notation) def visualize_reconstruction(seq, table, record, hairpin): ''' Creating backtrace from the given DP table and reconstruction Reference for plotting: https://matplotlib.org/3.1.1/gallery/images_contours_and_fields/image_annotated_heatmap.html ''' import matplotlib import matplotlib.pyplot as plt import numpy as np pairs = set(record) left_p = set([r[0] for r in record]) right_p = set([r[1] for r in record]) n = len(seq) visited = np.zeros((n, n)) arrows = [] # quadruples of (x, y, dx, dy) v_mark = 1 stack = [(0,n-1)] visited[0, n-1] = v_mark while len(stack) > 0: v_mark += 1 i,j = stack.pop() if i + hairpin >= j: continue elif (i, j) in pairs: # arrow to (i+1, j-1) arrows.append((i, j, 1, -1)) visited[i+1, j-1] = v_mark stack.append((i+1, j-1)) elif i in left_p and j in right_p: for k in range(i+1 + hairpin, j): if (i, k) in pairs: # draw arrow to (i, k) arrows.append((i, j, 0, k-j)) visited[i, k] = v_mark stack.append((i, k)) # arrow to (k+1, j) arrows.append((i, j, k+1-i, 0)) visited[k+1, j] = v_mark stack.append((k+1, j)) break elif i in left_p: # arrow to (i, j-1) arrows.append((i, j, 0, -1)) visited[i, j-1] = v_mark stack.append((i, j-1)) else: # arrow to (i+1, j) arrows.append((i, j, 1, 0)) visited[i+1, j] = v_mark stack.append((i+1, j)) rep = bracket_conversion(seq, record) fig = plt.figure(figsize =(5, 5)) ax = plt.gca() # revert the color for visited entries visited[visited > 0] = visited.max() + 1 - visited[visited>0] im = ax.imshow(visited, cmap='GnBu') threshold = im.norm(im.get_array().max())/2. table = np.array(table) ax.set_xticks(np.arange(n)) ax.set_xticklabels([rep[i] + '\n' + seq[i] for i in range(n)]) ax.set_yticks(np.arange(n)) ax.set_yticklabels([seq[i] for i in range(n)]) ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False) plt.setp(ax.get_xticklabels(), rotation=0, ha="right", rotation_mode="anchor") # Turn spines off and create white grid. for edge, spine in ax.spines.items(): spine.set_visible(False) ax.set_xticks(np.arange(n+1)-.5, minor=True) ax.set_yticks(np.arange(n+1)-.5, minor=True) ax.grid(which="minor", color="w", linestyle='-', linewidth=3, zorder=0) ax.tick_params(which="minor", bottom=False, left=False) for i in range(n): for j in range(n): text = ax.text(j, i, table[i, j], ha='center', va='center', color='w' if im.norm(visited[i, j])>threshold else 'black') for (x, y, dx, dy) in arrows: if dy > 0: dy -= 0.5 elif dy < 0: dy += 0.5 if dx > 0: dx -= 0.5 elif dx < 0: dx += 0.5 ax.arrow(y, x, dy, dx, width=0.01, head_width=0.2, alpha=0.5, color='r', zorder=2) display(plt, target="matplotlib-plot", append=False) plt.close() return import numpy as np import matplotlib.pyplot as plt from pyscript import Element tuple_set = set([('A', 'U'), ('U', 'A'), ('C', 'G'), ('G', 'C'), ('G', 'U'), ('U', 'G')]) hairpin_input = Element('hairpin-input').element.value if (len(hairpin_input) == 0): hairpin = 0 else: hairpin = int(hairpin_input) rna_seq = Element('seq-input').element.value if (len(rna_seq) == 0): rna_seq = 'AUAUCG' rna_seq = rna_seq.upper() dp_table = build_table(rna_seq, tuple_set, hairpin) connected_pairs = traceback(rna_seq, dp_table, hairpin) solution = bracket_conversion(rna_seq, connected_pairs) display(rna_seq, target="rna_sequence", append=False) display(solution, target="bracket_solution", append=False) visualize_reconstruction(rna_seq, dp_table, connected_pairs, hairpin)

Motivations

As we seek to uncover how protiens fold in the tertiary and quarternary structure, we must first look into how the structure appears in the two-dimensions. Nussinov algorithm is a technique which maximizes the number of base pair matchings. This is a rather naive method of generating a secondary structure, but is motivated by the belief that more matchings is indicative of stronger structural stability. While the results are often not biologically relevant, Nussinov provides a first step into looking at how simple dynamic programming algorithms can be extended for other methods such as energy minimization for structural prediction.

Note: PyScript (Python) is slow, give the page a couple seconds to load properly