My favorites
▼
|
Sign in
libsvm288fork
Fork of LIBSVM 2.88 that exposes |w|^2 and other model variables to Python
Project Home
Downloads
Wiki
Issues
Source
Checkout
Browse
Changes
Source path:
svn
/
trunk
/
python
/
learner.py
r5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
'''
###############################################
learner.py
Byron C Wallace
Tufts Medical Center
This module represents a learner. Includes active learning.
###############################################
'''
import pdb
import random
import svm
from svm import *
import dataset
def evaluate_learner(learner, include_labeled_data_in_metrics=True):
    '''
    Returns a dictionary containing various metrics for learner performance, as measured over the
    examples in the unlabeled_datasets belonging to the learner.

    learner -- object exposing labeled_datasets, unlabeled_datasets and cautious_predict().
    include_labeled_data_in_metrics -- when True, the minority/majority examples already in
        learner.labeled_datasets[0] are credited as true positives/true negatives, so that active
        learning strategies are not penalized for finding lots of the minority class during training.
        When False, nothing is credited (both counts are 0).

    The returned dictionary has the keys "npos" (positives credited from the labeled data),
    "confusion_matrix", "accuracy" and "sensitivity".
    '''
    results = {}

    # Default to zero so the confusion-matrix adjustment below is well defined even when
    # the labeled data is excluded (previously this path raised a NameError).
    tps, tns = 0, 0
    if include_labeled_data_in_metrics:
        tps = learner.labeled_datasets[0].number_of_minority_examples()
        tns = learner.labeled_datasets[0].number_of_majority_examples()
        print("positives found during learning: %s\nnegatives found during learning: %s" % (tps, tns))
    results["npos"] = tps

    print("evaluating learner over %s instances." % len(learner.unlabeled_datasets[0].instances))

    # get the raw points out for prediction; one list of points per feature space
    point_sets = [unlabeled_set.get_samples() for unlabeled_set in learner.unlabeled_datasets]
    # the labels are assumed to be the same; thus we only use the labels for the first dataset
    true_labels = learner.unlabeled_datasets[0].get_labels()

    # feed the corresponding point from each feature space to cautious_predict, one example at a time
    predictions = [learner.cautious_predict(list(views)) for views in zip(*point_sets)]

    conf_mat = svm.evaluate_predictions(predictions, true_labels)
    # evaluate_predictions does not include the instances found during training!
    conf_mat["tp"] += tps
    conf_mat["tn"] += tns
    print("confusion matrix:")
    print(conf_mat)
    results["confusion_matrix"] = conf_mat

    # assumes every value in conf_mat is a count (tp/tn/fp/fn) -- TODO confirm against svm.evaluate_predictions
    total = sum(conf_mat.values())
    if total == 0:
        # nothing was evaluated; avoid dividing by zero
        results["accuracy"] = 0.0
    else:
        results["accuracy"] = float(conf_mat["tp"] + conf_mat["tn"]) / float(total)

    if conf_mat["tp"] == 0:
        # also covers tp + fn == 0, which would otherwise divide by zero
        results["sensitivity"] = 0.0
    else:
        results["sensitivity"] = float(conf_mat["tp"]) / float(conf_mat["tp"] + conf_mat["fn"])
    return results
class learner:
    '''
    A learner over one or more parallel datasets (one per feature space), with support for
    active learning. Instances start out in the unlabeled dataset(s) and are moved to the
    labeled dataset(s) as they are 'labeled'; one model per dataset is trained on the labeled data.
    '''
    # Class-level defaults are kept for backward compatibility; __init__ gives every
    # instance its own containers so state is not shared between learners.
    labeled_datasets = []
    unlabeled_datasets = []
    picked_during_al = []
    # we need a param for each model
    params = []
    models = None

    def __init__(self, unlabeled_datasets=None, models=None):
        '''
        unlabeled_datasets -- list of datasets, one per feature space (defaults to an empty list).
        models -- optional list of already-built models, one per dataset.
        '''
        # None instead of a mutable [] default, which would be shared across calls
        if unlabeled_datasets is None:
            unlabeled_datasets = []
        # just using default parameter for now
        self.params = [svm_parameter(weight=[1, 1000]) for d in unlabeled_datasets]
        self.unlabeled_datasets = unlabeled_datasets
        # initialize empty labeled datasets (i.e., all data is unlabeled to begin with)
        self.labeled_datasets = [dataset.dataset([]) for d in unlabeled_datasets]
        self.picked_during_al = []
        self.models = models

    def active_learn(self, num_examples_to_label, query_function=None, num_to_label_at_each_iteration=10, rebuild_models_at_each_iter=True):
        '''
        Active learning loop. Uses the provided query function (query_function) to select a number of examples
        (num_to_label_at_each_iteration) to label at each step, until the total number of examples requested
        (num_examples_to_label) has been labeled. The models will be updated at each iteration if
        rebuild_models_at_each_iter is set; otherwise they are rebuilt once after the loop.

        query_function is called with the number of ids wanted and must return a list of instance ids;
        it defaults to self.SIMPLE. The final batch is shrunk so that no more than num_examples_to_label
        examples are labeled, and the loop stops early if the query function returns nothing.
        '''
        if not query_function:
            query_function = self.SIMPLE

        labeled_so_far = 0
        while labeled_so_far < num_examples_to_label:
            print("labeled %s out of %s" % (labeled_so_far, num_examples_to_label))
            # never ask for more than what is still missing, so we do not overshoot the request
            batch_size = min(num_to_label_at_each_iteration, num_examples_to_label - labeled_so_far)
            example_ids_to_label = query_function(batch_size)
            if not example_ids_to_label:
                # nothing left to label (or a non-positive batch size); stop instead of spinning forever
                break
            # now remove the selected examples from the unlabeled sets and put them in the labeled sets.
            self.label_instances_in_all_datasets(example_ids_to_label)
            if rebuild_models_at_each_iter:
                self.rebuild_models()
                print("models rebuilt with %s labeled examples" % len(self.labeled_datasets[0].instances))
            # count what was actually selected rather than what was asked for
            labeled_so_far += len(example_ids_to_label)

        if not rebuild_models_at_each_iter:
            self.rebuild_models()
        print("active learning loop completed; models rebuilt.")

    def label_instances_in_all_datasets(self, inst_ids):
        '''
        Removes the instances in inst_ids (a list of instance numbers to 'label') from the unlabeled dataset(s) and places
        them in the labeled dataset(s). These will subsequently be used in training models, thus this simulates 'labeling'
        the instances.
        '''
        for unlabeled_dataset, labeled_dataset in zip(self.unlabeled_datasets, self.labeled_datasets):
            labeled_dataset.add_instances(unlabeled_dataset.remove_instances(inst_ids))

    def cautious_predict(self, X):
        '''
        Returns the largest prediction made by any model, where X holds one point per model
        (X[i] is fed to self.models[i]). Raises an Exception if no models have been built.
        '''
        if not self.models:
            raise Exception("No models have been initialized.")
        return max([m.predict(x) for m, x in zip(self.models, X)])

    def pick_initial_training_set(self, k, build_models=True):
        '''
        Select a set of training examples from the dataset(s) at random. This set will be used
        to build the initial model. The **same training examples will be selected from each dataset.
        '''
        self.label_at_random(k)
        if build_models:
            print("building models...")
            self.rebuild_models()
            print("done.")

    def undersample_labeled_datasets(self, k=None):
        '''
        Undersamples the current labeled datasets and returns the undersampled copies; the
        labeled datasets held by this learner are left untouched.

        k -- number of majority instances to remove. When omitted (or 0), it is set to the difference
             between the majority and minority counts of the first labeled dataset.
        Raises an Exception if there is no labeled data.
        '''
        if not (self.labeled_datasets and len(self.labeled_datasets[0].instances)):
            raise Exception("No labeled data has been provided!")

        if not k:
            print("undersampling majority class to equal that of the minority examples")
            k = self.labeled_datasets[0].number_of_majority_examples() - self.labeled_datasets[0].number_of_minority_examples()
        # we copy the datasets rather than mutate the class members.
        copied_datasets = [dataset.dataset(list(d.instances)) for d in self.labeled_datasets]
        print("removing %s majority instances" % k)
        removed_instances = copied_datasets[0].undersample(k)
        # get the removed instance numbers
        removed_instance_nums = [inst.id for inst in removed_instances]
        # if there is more than one feature-space, remove the same instances from the remaining spaces (sets)
        for labeled_dataset in copied_datasets[1:]:
            labeled_dataset.remove_instances(removed_instance_nums)
        return copied_datasets

    def label_maximally_diverse_set(self, k, label_one_initially=True):
        '''
        Greedily labels examples judged most diverse with respect to those already labeled, then
        rebuilds the models. Nothing is returned; the chosen instances are moved to the labeled dataset(s).

        k -- target size of the set; k-1 greedy selections are made, the first member being the
             randomly labeled seed when label_one_initially is set.
        label_one_initially -- label one random example (and build models) before selecting.
             When False, self.models must already be populated.
        '''
        # first, label one example at random
        if label_one_initially:
            self.label_at_random(1)
            self.rebuild_models()

        # just use the first dataset for now....
        # TODO implement coin flip, etc
        model = self.models[0]

        # diversity function: summed cosine between x and every instance labeled so far.
        # NOTE(review): a larger summed cosine usually means *more* similar, so taking the maximum
        # below may pick the least diverse example -- confirm against compute_cos_between_examples.
        def div_function(x):
            return sum([model.compute_cos_between_examples(x.point, y.point) for y in self.labeled_datasets[0].instances])

        for step in range(k - 1):
            if not step % 100:
                print("on step %s" % step)
            candidates = self.unlabeled_datasets[0].instances
            if len(candidates) == 0:
                # the unlabeled pool is exhausted; nothing more to select
                break
            # add examples iteratively, selecting the highest-scoring one w.r.t. the examples
            # already selected; max() keeps the first candidate on ties
            most_diverse = max(candidates, key=div_function)
            # now label the most diverse example
            self.label_instances_in_all_datasets([most_diverse.id])

        print("building models...")
        self.rebuild_models()
        print("done.")

    def label_at_random(self, k):
        '''
        Select and 'label' a set of k examples from the (unlabeled) dataset(s) at random.
        Raises an Exception if the learner has no datasets.
        '''
        if not self.unlabeled_datasets:
            raise Exception("No datasets have been provided!")

        # remove a random subset of instances from one of our datasets (it doesn't matter which one)
        removed_instances = self.unlabeled_datasets[0].get_and_remove_random_subset(k)
        # add this set to the labeled data
        self.labeled_datasets[0].add_instances(removed_instances)
        # get the removed instance numbers
        removed_instance_nums = [inst.id for inst in removed_instances]
        # if there is more than one feature-space, move the same instances in the remaining spaces (sets)
        for unlabeled_dataset, labeled_dataset in zip(self.unlabeled_datasets[1:], self.labeled_datasets[1:]):
            labeled_dataset.add_instances(unlabeled_dataset.remove_instances(removed_instance_nums))

    def get_random_unlabeled_ids(self, k):
        '''
        Returns a random set of k distinct instance ids drawn from the first unlabeled dataset.
        Raises ValueError (from random.sample) if k exceeds the number of unlabeled instances.
        '''
        # sample from a copy so the list handed back by the dataset is never modified
        ids = list(self.unlabeled_datasets[0].get_instance_ids())
        return random.sample(ids, k)

    def SIMPLE(self, k):
        '''
        Returns the instance numbers for the k unlabeled instances closest the hyperplane
        (i.e., those with the smallest distance_to_hyperplane value), as a list.
        '''
        import heapq

        # just use the first dataset for now....
        # TODO implement coin flip, etc
        model = self.models[0]
        # NOTE(review): ranks by the raw distance value; if distance_to_hyperplane is signed,
        # abs() may be intended -- confirm.
        scored = [(model.distance_to_hyperplane(x.point), x.id) for x in self.unlabeled_datasets[0].instances]
        # key on the distance only so instance ids are never compared with each other
        closest = heapq.nsmallest(k, scored, key=lambda pair: pair[0])
        return [inst_id for _, inst_id in closest]

    def _get_max_val_key_tuple(self, d):
        '''
        Returns the (key, value) pair of d holding the largest value; the first such pair in
        iteration order wins on ties.
        '''
        return max(d.items(), key=lambda item: item[1])

    def rebuild_models(self, undersample_first=False):
        '''
        Rebuilds all models over the current labeled datasets.

        undersample_first -- train on undersampled copies of the labeled datasets instead.
        '''
        if undersample_first:
            print("undersampling before building models..")
            datasets = self.undersample_labeled_datasets()
            print("done.")
        else:
            datasets = self.labeled_datasets

        print("training model(s) on %s instances" % len(datasets[0].instances))
        self.models = []
        for labeled_set, param in zip(datasets, self.params):
            samples, labels = labeled_set.get_samples_and_labels()
            problem = svm_problem(labels, samples)
            # find C, gamma parameters for each model
            print("finding optimal C, gamma parameters...")
            param.C, param.gamma = grid_search(problem, param)
            print("C:%s; gamma:%s" % (param.C, param.gamma))
            self.models.append(svm_model(problem, param))
        print("done.")
Show details
Hide details
Change log
r5
by byron.wallace on Apr 7, 2009
Diff
updated active learning framework
Go to:
/trunk/python/dataset.py
/trunk/python/learner.py
Project members,
sign in
to write a code review
Older revisions
All revisions of this file
File info
Size: 12712 bytes, 279 lines
View raw file
File properties
svn:executable
*
Powered by
Google Project Hosting