In a practical, we saw the scs function (copied below along with overlap) for finding the shortest common superstring of a set of strings.
def overlap(a, b, min_length=3):
""" Return length of longest suffix of 'a' matching
a prefix of 'b' that is at least 'min_length'
characters long. If no such overlap exists,
return 0. """
start = 0 # start all the way at the left
while True:
start = a.find(b[:min_length], start) # look for b's suffx in a
if start == -1: # no more occurrences to right
return 0
# found occurrence; check for full suffix/prefix match
if b.startswith(a[start:]):
return len(a)-start
start += 1 # move just past previous match
import itertools
def scs(ss):
""" Returns shortest common superstring of given
strings, which must be the same length """
shortest_sup = None
for ssperm in itertools.permutations(ss):
sup = ssperm[0] # superstring starts as first string
for i in range(len(ss)-1):
# overlap adjacent strings A and B in the permutation
olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
# add non-overlapping portion of B to superstring
sup += ssperm[i+1][olen:]
if shortest_sup is None or len(sup) < len(shortest_sup):
shortest_sup = sup # found shorter superstring
return shortest_sup # return shortest
It's possible for there to be multiple different shortest common superstrings for the same set of input strings. Consider the input strings ABC, BCA, CAB. One shortest common superstring is ABCAB but another is BCABC and another is CABCA.
Q1. What is the length of the shortest common superstring of the following strings?
CCT, CTT, TGC, TGG, GAT, ATT
Q2. How many different shortest common superstrings are there for the input strings given in the previous question?
def revised_scs(ss):
""" Returns shortest common superstring of given
strings, which must be the same length """
shortest_sup = []
for ssperm in itertools.permutations(ss):
sup = ssperm[0] # superstring starts as first string
for i in range(len(ss)-1):
# overlap adjacent strings A and B in the permutation
olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
# add non-overlapping portion of B to superstring
sup += ssperm[i+1][olen:]
shortest_sup.append(sup) # found shorter superstring
shortest_len = len(ss) * len(ss[0])
for sup in shortest_sup:
if len(sup) <= shortest_len:
shortest_len = len(sup)
shortest_sup = [sup for sup in shortest_sup if len(sup) == shortest_len]
return list(set(shortest_sup)) # return shortest
# Question 1, 2
shortest_sup_list = revised_scs(['CCT', 'CTT', 'TGC', 'TGG', 'GAT', 'ATT'])
print("Length of the shortest common superstring:", len(shortest_sup_list[0]))
print("Number of different shortest common superstrings:", len(shortest_sup_list))
Assemble reads using one of the approaches discussed, such as greedy shortest common superstring. Since there are many reads, you might consider ways to make the algorithm faster, such as the one discussed in the programming assignment in the previous module.
Q3. How many As are there in the full, assembled genome?
Q4. How many Ts are there in the full, assembled genome from the previous question?
import operator
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # skip name line
seq = fh.readline().rstrip() # read base sequence
fh.readline() # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
def smart_overlap_map(reads, k):
olaps = {}
result = {}
for read in reads:
for i in range(len(read)-k+1):
if read[i:i+k] not in olaps:
olaps[read[i:i+k]] = [read]
else:
olaps[read[i:i+k]].append(read)
count = 0
for read in reads:
read_suffix = read[-k:]
for possible_read in olaps[read_suffix]:
if possible_read != read:
olen = overlap(read, possible_read, k)
if olen > 0:
count += 1
result[(read, possible_read)] = olen
return result, count
def pick_maximal_overlap(reads, k):
""" Return a pair of reads from the list with a
maximal suffix/prefix overlap >= k. Returns
overlap length 0 if there are no such overlaps."""
reada, readb = None, None
best_olen = 0
for a, b in itertools.permutations(reads, 2):
olen = overlap(a, b, min_length=k)
if olen > best_olen:
reada, readb = a, b
best_olen = olen
return reada, readb, best_olen
def smart_greedy_scs(reads, k):
""" Greedy shortest-common-superstring merge.
Repeat until no edges (overlaps of length >= k)
remain. """
pairs_olen, pairs_count = smart_overlap_map(reads, k)
sorted_pairs_olen = sorted(pairs_olen.items(), key=operator.itemgetter(1), reverse=True)
read_a, read_b, olen = sorted_pairs_olen[0][0][0], sorted_pairs_olen[0][0][1], sorted_pairs_olen[0][1]
while olen > 0:
reads.remove(read_a)
reads.remove(read_b)
reads.append(read_a + read_b[olen:])
pairs_olen, pairs_count = smart_overlap_map(reads, k)
if pairs_olen != {}:
sorted_pairs_olen = sorted(pairs_olen.items(), key=operator.itemgetter(1), reverse=True)
read_a, read_b, olen = sorted_pairs_olen[0][0][0], sorted_pairs_olen[0][0][1], sorted_pairs_olen[0][1]
else:
read_a, read_b, olen = pick_maximal_overlap(reads, k)
return ''.join(reads)
# Question 3, 4
reads_filename = 'ads1_week4_reads.fastq'
fastq_reads, _ = readFastq(reads_filename)
genome = smart_greedy_scs(fastq_reads, 10)
print("Number of As in the assembled genome:", genome.count('A'))
print("Number of Ts in the assembled genome:", genome.count('T'))