My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python

"""
Naive bayes classification.

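Takes a training set and a test set, each as a csv file, and learns a naive
Bayes classifier from the training set. It then runs the classifier against
each member of the test set and prints, for each instance, the predicted class,
whether that prediction matches the instance's actual class, and the per-class
scores.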

For the values in the csvs, the first value is the class of that instance; the
rest are the attributes that we'll use for classification. (all instances
should have the same number of attributes)
"""

from __future__ import division
from collections import defaultdict
from functools import reduce
import math

class Instance(object):
    """A labelled example: a class label (cl) plus a list of attributes.

    The attribute list is stored exactly as passed in; an attribute is
    identified by its position in that list.
    """

    def __init__(self, cl, attributes):
        # cl: the class label of this instance.
        self.cl = cl
        # attributes: the attribute values, in positional order.
        self.attributes = attributes

    def __repr__(self):
        # Readable form for debugging and for printing lists of instances.
        return "%s(%r, %r)" % (type(self).__name__, self.cl, self.attributes)

## TODO: add the m-estimates, so we can get something like smoothing here too.
def estimate_probabilities(training):
    """Estimate class and per-class attribute probabilities from training data.

    Probabilities are plain relative frequencies; no smoothing is applied.

    Args:
        training: iterable of instances, each exposing ``cl`` (its class) and
            ``attributes`` (a sequence of attribute values). It is traversed
            exactly once, so a generator is acceptable.

    Returns:
        A pair of defaultdicts ``(classprobs, attributeprobs)``:

        * ``classprobs`` maps each class to the fraction of training
          instances that belong to it.
        * ``attributeprobs`` maps ``(cl, attribute-pos, attribute-value)`` to
          the fraction of instances of class ``cl`` whose attribute at that
          position has that value.

        Both dictionaries yield 0 for keys never seen in the training data.
    """
    attributecounts = defaultdict(int)
    classcounts = defaultdict(int)

    # x is an instance in the training set.
    for x in training:
        cl = x.cl
        classcounts[cl] += 1
        for pos, xi in enumerate(x.attributes):
            attributecounts[(cl, pos, xi)] += 1

    # Every instance was counted under exactly one class, so the class counts
    # sum to the training-set size. Deriving it this way avoids calling len()
    # on the input, which would fail for one-shot iterables.
    n = sum(classcounts.values())

    # The divisions below must be true divisions; under Python 2 that relies
    # on the file-level ``from __future__ import division``.
    classprobs = defaultdict(int)
    for cl, count in classcounts.items():
        classprobs[cl] = count / n

    # Probability of seeing that field assigned that value, given that the
    # instance is a member of that class. There is one key for each observed
    # combination of class, attribute position and attribute value; key[0] is
    # the class, whose count is the denominator.
    attributeprobs = defaultdict(int)
    for key, count in attributecounts.items():
        attributeprobs[key] = count / classcounts[key[0]]
    return classprobs, attributeprobs

def load_dataset(fn):
    """Load a list of instances from a comma-separated file.

    Each non-blank line becomes one Instance: the first comma-separated field
    is its class and the remaining fields are its attributes. Whitespace
    around the whole line is stripped before splitting; individual fields are
    kept as the raw strings between commas.

    Args:
        fn: path of the file to read.

    Returns:
        A list of Instance objects, in file order.
    """
    out = []
    with open(fn) as infile:
        # Iterate the file directly instead of materializing every line.
        for line in infile:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at end of file) would
                # otherwise yield an instance with class "" and no attributes,
                # which skews the estimated class probabilities.
                continue
            splitted = line.split(",")
            out.append(Instance(splitted[0], splitted[1:]))
    return out

def product(nums):
    """Return the product of all values in nums.

    Args:
        nums: iterable of numbers.

    Returns:
        The running product of the values, or 1 (the multiplicative
        identity) when nums is empty.
    """
    # The explicit initial value 1 makes the empty case well defined; without
    # it, reduce raises TypeError on an empty sequence.
    return reduce(lambda acc, num: acc * num, nums, 1)

def classify(instance, class_probs, attribute_probs):
    """Classify an instance with naive Bayes, print the outcome, and return it.

    The score of a class is its class probability multiplied by the
    probability of each of the instance's (position, value) attribute pairs
    given that class. The first class with the highest score is chosen.

    One line is printed per call: the predicted class, "Correct!" or "Wrong!"
    depending on whether it equals ``instance.cl``, and the list of scores.

    Args:
        instance: object exposing ``cl`` and ``attributes``.
        class_probs: mapping from class to probability.
        attribute_probs: mapping from (cl, attribute-pos, attribute-value) to
            probability. It is indexed directly, so it must supply a value
            for combinations that were never seen in training (a defaultdict
            does this; note that such a lookup also stores the key in it).

    Returns:
        The predicted class.

    Raises:
        ValueError: if class_probs is empty, since there is no class to pick.
    """
    possible_cls = list(class_probs.keys())

    attr_pairs = list(enumerate(instance.attributes))

    # One score per class, in the same order as possible_cls so the index of
    # the best score can be mapped back to its class.
    scores = [
        class_probs[cl] * product([attribute_probs[(cl, pos, value)]
                                   for (pos, value) in attr_pairs])
        for cl in possible_cls
    ]
    maxindex = scores.index(max(scores))
    maxclass = possible_cls[maxindex]
    correct = (instance.cl == maxclass)
    print("%s %s %s" %
          (maxclass, ("Correct!" if correct else "Wrong!"), scores))
    # The original docstring promised a classification but nothing was
    # returned; returning it lets callers tally accuracy.
    return maxclass

import sys
def main():
    """Command-line entry point: train on one csv file, classify another.

    Expects exactly two command-line arguments: the training csv and the test
    csv. Learns the probabilities from the training set, then classifies each
    test instance, which prints one result line per instance.

    Raises:
        SystemExit: with status 2 when the argument count is wrong, after
            printing the usage message.
    """
    if len(sys.argv) != 3:
        print("usage: %s training.csv test.csv" % (sys.argv[0],))
        # Stop here: falling through would either fail with an IndexError on
        # the missing arguments or silently ignore surplus ones.
        sys.exit(2)

    training = load_dataset(sys.argv[1])
    classprobs, attributeprobs = estimate_probabilities(training)

    test = load_dataset(sys.argv[2])
    for instance in test:
        classify(instance, classprobs, attributeprobs)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

Change log

r501 by alex.rudnick on Oct 9, 2010   Diff
some cleanup, also make it work under
python 2 and python3
Go to: 
Project members, sign in to write a code review

Older revisions

r500 by alex.rudnick on Oct 9, 2010   Diff
Simple Naive Bayes classifier
All revisions of this file

File info

Size: 3641 bytes, 107 lines

File properties

svn:executable
*
Powered by Google Project Hosting