EEEB GU4055
        
        1. Review notebook assignments: Scientific Python.
        
        2. Discuss the assigned reading: Genome Annotation.
        
        3. Introduce new topic: BLAST & Homology.
        
    
 
    "I'm having trouble learning Python, am I completely lost?"
    
    
    No, we expect you to learn through 
    experience. Keep trying to complete the exercises. 
    Run the notebooks again with the posted answers. Seek out help. 
    As we said, most future exercises will recycle and reuse the coding 
    concepts that we've learned so far. 
    
Open and close files in Python.
  # Open a file object
  infile = open("./datafiles/data.txt", "r")
  # read data as a string
  data = infile.read()
  # close the file object
  infile.close()
        
        
  # A simpler alternative: the with-statement closes the file automatically
  with open("./datafiles/data.txt", 'r') as infile:
      data = infile.read()
        
    Open, write to, and close files in Python.
  # Open a file object for writing
  outfile = open("./datafiles/data.txt", "w")
  # write data as a string
  outfile.write("hello world")
  # close the file object
  outfile.close()
        
        
  # A simpler alternative: the with-statement closes the file automatically
  with open("./datafiles/data.txt", 'w') as outfile:
      outfile.write("hello world")
        
    Python objects.
  # Dictionaries are used to store key/value pairs
  {"a key": ["and value in a dictionary"]}
        
        
  # dicts can be created using curly brackets or the dict() func.
  dict([("a", 3), ("b", 4), ("c", 5)])
        
    List comprehension methods: a shortcut.
  # list-comprehension example for list objects
  newlist = [i for i in range(10)]
  newlist          
        
        
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        
        
  # dictionary comprehension: build a dict from a list of [key, value] pairs
  ddict = {i: j for (i, j) in [['a', 1], ['b', 2], ['c', 3]]}
  ddict
        
        
  {'a': 1, 'b': 2, 'c': 3}
        
    Action 1: Add comments to the code below.
# comment: import the random library
import random
# comment: create a list with 1000 random integers between 0 and 10 (inclusive)
integer_list = [random.randint(0, 10) for i in range(1000)]
# comment: create an empty dictionary
counter = {}
# comment: iterate over elements of the integer list
for item in integer_list:
    
    # comment: conditional True if item is not already in the dict keys
    if item not in counter:
        # comment: set the value to 1 for this key
        counter[item] = 1
    
    # comment: item is already in dict keys 
    else:
        # comment: increment value by 1 for this key
        counter[item] += 1
        
    
  GENCODE = {
      'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M', 
      'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 
      'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K', 
      'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',                  
      'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L', 
      'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 
      'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q', 
      'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 
      'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 
      'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 
      'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E', 
      'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 
      'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S', 
      'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L', 
      'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_', 
      'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W', 
  } 
  # get amino acid for a particular codon
  GENCODE["CTA"]
            
                
  "L"
        
    Create a string of DNA with a coding block between two non-coding blocks.
  # use a random seed so we will all draw the same random values each time (repeatable) 
  random.seed(12345)
  # create a genomic region containing a coding region surrounded by non-coding DNA
  region = random_dna(300) + "ATG" + random_coding_dna(100) + "TGA" +  random_dna(300)
  region
            
                
  'TAGGCGTCGATGCCGATCCCACGGATGATAACCGATACTCGACATCCGTCACGACCGGCTGAAATATCAGCATAATGTCGACATCGCCCCGCAACATCAGTATTCCCAGGCTCCCTTGAATCCCCGGCAGTAGAACGAGTGTGTGGTTAGTACGCAAAACTTCGGCGGTAGGATCCACGCGTCACAAGTGACATCCGGCGAAACTACGCTTTAGATGAGTTAGGTGCTAATAACAAGCATTTATCCGCTCTCCCCTACAAAAGCCGCTGTTCTAAGCTTATTAGCTGTACCTGCAGATGCATGTCTGATCCAATCGGTCAACACAGATGGGCACTCAGGACGTGTTCTAGACTCTGTGCAATAATCAAAGCCGCGTCTTGGTCTAATCCGAGAAATTTAGACGACCCAGTCCTTATCAGACGACAATGTGGAGCGCAATCTGAAGATGGTGAGTTCCGCGCCGCACTGGTCCTTGTTACCGAGCGTTTGGGCCGTTGTGAAAAGTGCCGAACAGGGACTGGTTGCTTAAACCTGGAGCCCTATCAGGGTCAACGTACGCATGGCGAAAACTCACGAAGGGACATATCCCGGAAAGATATACCTTGACGCTCGGGTAGCTAGTTCGGCTTATGCTTCGTGCTGACCAATCGACCAAGGCGGGGTAATTGCGACGACCCGCGGAACCACAACTTTACCCTAGACAAGCGGCGCGTAGCGTCCTATCGCCGGGAGTCTAACTCAAATCATATGGCCCATCGCAGTGCGTGAGTTTTATTCAGCCCACCCCAACAAGAGATCGAAATAGTAATCTGTCTCTCTGCTATGATGAGACAATGTCCGTACACTCACTACTTGTTGTACAGTAGATATTCAACCTTAGTGGTTGGTACCTTAGGGTGGGCGAAT'
        
    
  # find the first occurrence 
  region.index("ATG")
  # 9
        
        
  # split on ATG and count items in the resulting list
  len(region.split("ATG"))
  # 16 (the number of pieces; this is one more than the 15 occurrences of ATG)
        
        
  # best: get a list with every ATG start position
  starts = [i for (i, j) in enumerate(region) if region[i: i+3] == "ATG"]
  # [9, 24, 74, 214, 296, 300, 326, 425, 445, 559, 629, 747, 822, 825, 833]
        
    
# create a 3-dimensional array
np.zeros((5, 3, 3))
            
                
array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],
       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],
       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],
       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],
       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])
        
    
# create a dataframe with randomly generated values
df = pd.DataFrame({
    "column1": np.random.normal(0, 1, 10),
    "column2": np.random.choice(list("ACGT"), 10), 
    "column3": np.random.randint(0, 5, 10),
})
# return the dataframe
df
            
                
  column1   column2   column3
0   1.633686    T   4
1   0.071054    G   4
2   -0.201375   T   0
3   0.461895    A   3
4   -1.480207   G   2
5   -0.729692   C   0
6   -0.143699   A   4
7   -1.108858   G   2
8   -1.704039   A   1
9   -2.760000   A   1
        
    
    Complete Unit 4 on Codio: 2 notebooks.
    
    
    Waterhouse, Robert M., et al. 2013. “OrthoDB: A Hierarchical Catalog of Animal, Fungal and Bacterial Orthologs.” Nucleic Acids Research 41 (Database issue): D358–65. 
    https://doi.org/10.1093/nar/gks1116