My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python

# Adatron SVM with a Gaussian (RBF) kernel
# placed in the public domain by Stavros Korokithakis

import sys
from math import exp

# Protein class codes. The first four are also used as column indices into
# each label row built by create_tables().
CYTOSOLIC = 0
EXTRACELLULAR = 1
NUCLEAR = 2
MITOCHONDRIAL = 3
# Proteins of this type get a feature vector but no label row in
# create_tables().
BLIND = 4

# Coefficient of the kernel: create_kernel_table() computes
# exp(-D * squared_distance) for every pair of feature vectors.
D = 5.0

# Number of leading residues counted for a protein's local composition.
LENGTH = 50

# Every protein loaded so far; load_file() appends to this list.
PROTEINS = []

# One-letter amino-acid codes; used as the keys of the composition dicts.
AMINOACIDS = "ACDEFGHIKLMNPQRSTVWY"

class Protein:
    """A protein record together with its amino-acid composition features.

    The constructor stores its arguments unchanged and immediately derives
    two composition tables from ``sequence``:

    * ``local_composition``  -- fraction of each amino acid among the first
      ``LENGTH`` residues;
    * ``global_composition`` -- fraction of each amino acid in the whole
      sequence.

    Both tables are dicts keyed by the letters of ``AMINOACIDS``.
    """

    def __init__(self, name, mass, isoelectric_point, size, sequence, type):
        self.name = name
        self.mass = mass
        self.isoelectric_point = isoelectric_point
        self.size = size
        self.sequence = sequence
        self.type = type
        self.extract_composition()

    def extract_composition(self):
        """Fill ``local_composition`` and ``global_composition``.

        Raises:
            KeyError: if the sequence contains a letter that is not in
                ``AMINOACIDS``.
        """
        # Slice instead of indexing range(LENGTH) so that a sequence shorter
        # than LENGTH is handled; the fractions are then taken over the
        # residues that actually exist.
        window = self.sequence[:LENGTH]
        self.local_composition = dict((x, 0.0) for x in AMINOACIDS)
        for aminoacid in window:
            self.local_composition[aminoacid] += 1.0 / len(window)

        self.global_composition = dict((x, 0.0) for x in AMINOACIDS)
        for aminoacid in self.sequence:
            self.global_composition[aminoacid] += 1.0 / len(self.sequence)

    def create_vector(self):
        """Return the feature vector of this protein.

        The vector holds the local composition values followed by the global
        composition values, each ordered by amino-acid letter.
        """
        vector = [value for _, value in sorted(self.local_composition.items())]
        # The second half must read the global table's own values, not a
        # value left over from iterating the local table.
        vector.extend(
            value for _, value in sorted(self.global_composition.items()))
        return vector


def load_file(filename, type):
    """Load proteins from a tab-separated file and append them to PROTEINS.

    Each data line must hold five tab-separated fields: name, mass,
    isoelectric point, size and sequence. The fields are passed to
    ``Protein`` as strings, together with ``type``. Lines that begin with
    "name" are treated as a header and skipped, as are blank lines.

    Args:
        filename: path of the file to read.
        type: class code given to every protein read from this file.

    Raises:
        ValueError: if a data line does not split into exactly five fields.
    """
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as protfile:
        for line in protfile:
            if line.startswith("name"):
                continue
            record = line.strip()
            if not record:
                # A blank (e.g. trailing) line carries no protein.
                continue
            name, mass, isoelectric_point, size, sequence = record.split("\t")
            PROTEINS.append(
                Protein(name, mass, isoelectric_point, size, sequence, type))


def create_tables():
    """Build the feature table and the label table from PROTEINS.

    Returns:
        A ``(feature_table, label_table)`` pair. ``feature_table`` has one
        vector per loaded protein. ``label_table`` has one row of four labels
        per protein whose type is not ``BLIND``: +1 in the column of the
        protein's own type and -1 elsewhere.
    """
    feature_table = [protein.create_vector() for protein in PROTEINS]

    label_table = []
    for protein in PROTEINS:
        if protein.type != BLIND:
            # Start with every class marked negative, then flag the one
            # class this protein belongs to as positive.
            labels = [-1, -1, -1, -1]
            labels[protein.type] = -labels[protein.type]
            label_table.append(labels)

    return feature_table, label_table


def create_kernel_table(feature_table):
    """Return the table of kernel values for every pair of feature vectors.

    Entry ``[i][j]`` is ``exp(-D * s)``, where ``s`` is the sum of squared
    component differences between vectors ``i`` and ``j``.
    """
    kernel_table = []
    for row in feature_table:
        kernel_row = []
        for candidate in feature_table:
            squared_distance = 0.0
            for index, component in enumerate(row):
                squared_distance += (component - candidate[index]) ** 2
            kernel_row.append(exp(-D * squared_distance))
        kernel_table.append(kernel_row)
    return kernel_table


def train_adatron(kernel_table, label_table, h, c, tolerance=0.5,
                  max_iterations=20):
    """Train one Adatron classifier per label column.

    In every pass, for each class, a candidate multiplier (beta) is computed
    for every sample from the current multipliers (alphas) and bias; only the
    sample whose candidate differs most from its current alpha is actually
    updated, after which the class bias is incremented by
    ``sum(label * alpha) / 4``. One progress line is printed per pass.

    Args:
        kernel_table: square table of kernel values, indexed
            ``[sample][sample]``.
        label_table: labels indexed ``[sample][class]``.
        h: factor applied to the margin error when proposing a new alpha.
        c: upper bound for every alpha (the lower bound is 0.0).
        tolerance: training stops once the largest recorded alpha change of
            every class is below this value.
        max_iterations: training stops when the pass counter reaches this
            value; ``None`` disables the cap. The default of 20 is the
            cut-off the original code hard-wired for the shedskin test.

    Returns:
        ``(alphas, bias)`` where ``alphas`` is indexed ``[class][sample]`` and
        ``bias`` is indexed ``[class]``. A tuple is returned in every case,
        including when the ``10 * len(kernel_table)`` pass budget runs out
        before convergence.
    """
    sample_count = len(kernel_table)
    class_count = len(label_table[0])

    alphas = [[0.0] * sample_count for _ in range(class_count)]
    betas = [[0.0] * sample_count for _ in range(class_count)]
    bias = [0.0] * class_count
    labelalphas = [0.0] * sample_count
    # (largest alpha change, sample index) per class. Classes not visited yet
    # keep the initial 0.0 and therefore count as converged in the check.
    max_differences = [(0.0, 0)] * class_count

    for iteration in range(10 * sample_count):
        # Single-argument form: same output under Python 2 and Python 3.
        print("Starting iteration %s..." % iteration)
        if max_iterations is not None and iteration >= max_iterations:
            return alphas, bias

        for klass in range(class_count):
            class_alphas = alphas[klass]
            class_betas = betas[klass]

            for elem in range(sample_count):
                labelalphas[elem] = label_table[elem][klass] * class_alphas[elem]

            largest = (0.0, 0)
            for col, kernel_row in enumerate(kernel_table):
                prediction = 0.0
                for kernel_value, labelalpha in zip(kernel_row, labelalphas):
                    prediction += kernel_value * labelalpha
                g = 1.0 - (prediction + bias[klass]) * label_table[col][klass]
                # Candidate multiplier, clipped to the box [0, c].
                class_betas[col] = min(max(class_alphas[col] + h * g, 0.0), c)
                difference = abs(class_alphas[col] - class_betas[col])
                if difference > largest[0]:
                    largest = (difference, col)
            max_differences[klass] = largest

            if all(change < tolerance for change, _ in max_differences):
                return alphas, bias

            # Accept only the candidate that moved the most.
            chosen = largest[1]
            class_alphas[chosen] = class_betas[chosen]

            element_sum = 0.0
            for elem in range(sample_count):
                element_sum += label_table[elem][klass] * class_alphas[elem] / 4.0
            bias[klass] += element_sum

    # Pass budget exhausted without convergence: still hand back the current
    # state instead of implicitly returning None.
    return alphas, bias

def calculate_error(alphas, bias, kernel_table, label_table):
    """Return the fraction of samples whose predicted class is wrong.

    For every sample a score is computed per class as
    ``sum_j kernel[sample][j] * label[j][class] * alpha[class][j] + bias[class]``.
    The class with the highest score is the prediction (the lowest index wins
    ties); it counts as an error when the sample's label for that class is
    negative.

    Args:
        alphas: multipliers indexed ``[class][sample]``.
        bias: bias values indexed ``[class]``.
        kernel_table: square table of kernel values.
        label_table: labels indexed ``[sample][class]``.

    Returns:
        The error rate as a float between 0.0 and 1.0; 0.0 when there are no
        samples.
    """
    sample_count = len(kernel_table)
    if sample_count == 0:
        return 0.0
    class_count = len(label_table[0])

    errors = 0
    for sample, kernel_row in enumerate(kernel_table):
        scores = []
        for klass in range(class_count):
            # Start from zero for every (sample, class) pair so that scores
            # do not leak from one pair into the next.
            prediction = 0.0
            for other in range(sample_count):
                prediction += (kernel_row[other] *
                               label_table[other][klass] *
                               alphas[klass][other])
            scores.append(prediction + bias[klass])

        predicted_class = scores.index(max(scores))
        if label_table[sample][predicted_class] < 0:
            errors += 1

    return 1.0 * errors / sample_count


def main():
    """Load the labelled test data, train the classifiers, print the error.

    The error is computed on the same kernel table that was used for
    training.
    """
    # A blind set ("b.txt", BLIND) is not loaded here.
    datasets = [
        ("testdata/c.txt", CYTOSOLIC),
        ("testdata/e.txt", EXTRACELLULAR),
        ("testdata/n.txt", NUCLEAR),
        ("testdata/m.txt", MITOCHONDRIAL),
    ]
    for filename, protein_type in datasets:
        load_file(filename, protein_type)

    print("Creating feature tables...")
    feature_table, label_table = create_tables()

    # The kernel table is always rebuilt from the features; loading a pickled
    # copy from "kernel_table.txt" was an earlier alternative.
    print("Creating kernel table...")
    kernel_table = create_kernel_table(feature_table)

    print("Training SVM...")
    alphas, bias = train_adatron(kernel_table, label_table, 1.0, 3.0)
    print(calculate_error(alphas, bias, kernel_table, label_table))


if __name__ == "__main__":
    main()

Change log

r1429 by mark.dufour on Apr 26, 2010   Diff
adatron example (stavros korokithakis)
Go to: 
Project members, sign in to write a code review

Older revisions

All revisions of this file

File info

Size: 6315 bytes, 172 lines
Powered by Google Project Hosting