#!/usr/bin/python2.4
#
# Copyright (c) 2006 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Extracts meta information from Google Base feeds.

This module makes an unauthenticated HTTP connection to Google Base to get
meta information and make it available in a convenient form.
"""

import logging
from xml.dom import minidom

try:
    # Python 2 module layout.
    import urllib2
    import urlparse
    from urllib import urlencode
except ImportError:
    # Python 3 moved these into the urllib package; keep the old names bound
    # so the rest of the module is written once.
    import urllib.request as urllib2
    import urllib.parse as urlparse
    from urllib.parse import urlencode


BASE_URL = 'http://base.google.com/'
GM_NAMESPACE_URI = 'http://base.google.com/ns-metadata/1.0'
G_NAMESPACE_URI = 'http://base.google.com/ns/1.0'
ATOM_NAMESPACE_URI = 'http://www.w3.org/2005/Atom'

_LOG = logging.getLogger(__name__)


def _node_to_xml(node):
    """Serializes a DOM node to a native string.

    On Python 2 this is the UTF-8 encoded byte string produced by
    node.toxml('utf-8'). On Python 3 toxml() returns bytes when given an
    encoding, so the result is decoded back to text to keep it joinable
    with ordinary string literals.

    Args:
      node: a minidom node

    Returns:
      the XML serialization of the node, as a native str
    """
    xml = node.toxml('utf-8')
    if not isinstance(xml, str):
        xml = xml.decode('utf-8')
    return xml


class GoogleBaseService(object):
    """Communicates with Google Base feeds."""

    def __init__(self, developer_key, base_url=BASE_URL):
        """Creates a service object and initializes it with the developer key.

        Args:
          developer_key: developer key issued for this application
          base_url: Google Base server and path to connect to, used as
                    the base url for connecting to the feeds.
                    (default: http://base.google.com/)
        """
        self._base_url = urlparse.urljoin(base_url, '/base/feeds/')
        self._opener = urllib2.build_opener()
        # The developer key is sent as a header with every request made
        # through this opener.
        self._opener.addheaders = [('X-Google-Key', 'key=' + developer_key)]

    def listMostCommonItemTypeAttributes(self, item_type,
                                         max_results=25, max_values=5):
        """Gets a list of the attributes most commonly used for an item type.

        Args:
          item_type: item type name
          max_results: maximum number of attributes to query for
                       (25 by default)
          max_values: maximum number of example values to query for
                      (5 by default)

        Returns:
          most commonly used attributes and their values, an object of type
          MostCommonAttributes. Each example value is the XML serialization
          of the content of a gm:value element.
        """
        feed = self.run_query('attributes',
                              bq='[item type: %s]' % (item_type,),
                              max_results=max_results,
                              max_values=max_values,
                              refine='true')
        # Merge text nodes, to make it easier to work with
        feed.normalize()

        retval = MostCommonAttributes()
        for attribute in feed.getElementsByTagNameNS(GM_NAMESPACE_URI,
                                                     'attribute'):
            text_values = []
            for value in attribute.getElementsByTagNameNS(GM_NAMESPACE_URI,
                                                          'value'):
                text = ''.join([_node_to_xml(c) for c in value.childNodes])
                text_values.append(text)
            key = (attribute.getAttribute('name'),
                   attribute.getAttribute('type'))
            retval.addAttribute(key, text_values)
        return retval

    def run_query(self, feed='snippets', **kargs):
        """Run a query on a feed.

        Args:
          feed: Google Base feed name (snippets, items, ...).
                "snippets" by default
          **kargs: feed parameters as keyword arguments
                   (q, bq, max_results, ...); different feeds take
                   different parameters. Replace - by _ in the
                   parameter names.

        Returns:
          answer, as a DOM tree
        """
        # Keyword names cannot contain '-', so callers write '_' instead and
        # it is translated back here.
        parameters = [(key.replace('_', '-'), kargs[key]) for key in kargs]
        relative_url = '%s?%s' % (feed, urlencode(parameters))
        url = urlparse.urljoin(self._base_url, relative_url)
        # Library code must not write to stdout; report the URL through
        # logging so that applications can opt in to seeing it.
        _LOG.debug('Querying %s', url)
        handle = self._opener.open(url)
        try:
            return minidom.parse(handle)
        finally:
            handle.close()


class MostCommonAttributes(object):
    """Keeps the most common attributes and their examples.

    This class works very much like a map of attribute (name, type) to
    values that would keep the keys in order. The attributes are ordered
    from the most common to the least common.
    """

    def __init__(self):
        # Attribute keys, in insertion order.
        self._attributes = []
        # Maps an attribute key to its list of example values.
        self._values = {}

    def getExamplesFor(self, attribute):
        """Returns the example values recorded for an attribute.

        Args:
          attribute: a (name, type) tuple, as passed to addAttribute

        Returns:
          the values stored for that attribute

        Raises:
          KeyError: if the attribute was never added
        """
        return self._values[attribute]

    def addAttribute(self, attribute, values):
        """Records an attribute and its example values.

        Args:
          attribute: a (name, type) tuple
          values: example values for the attribute
        """
        # This attribute is always there, but it's not interesting
        if attribute == ('item type', 'text'):
            return
        self._attributes.append(attribute)
        self._values[attribute] = values

    @property
    def attributes(self):
        """The list of attribute (name, type) tuples, in insertion order."""
        return self._attributes


_TOESCAPE = '|:="[]()*#<>\\+-'


def escape(to_escape):
    """Escape special characters in a string for inclusion in a Google Base
    query.

    Args:
      to_escape: a string

    Returns:
      the same string with a backslash in front of some special characters
    """
    pieces = []
    for c in to_escape:
        if c in _TOESCAPE:
            pieces.append('\\')
        pieces.append(c)
    return ''.join(pieces)