python抓取

  1. 下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
  2. "仅部分预览"的文档,不可在线预览的部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
  3. 如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。

• • • • • • • • • • • • • • •
# NOTE(review): both functions take `self` and use attributes that
# APIClient.__init__ assigns, so they are presumably APIClient methods.

def set_access_token(self, access_token, expires_in):
    '''
    Remember the access token and its expiry on this client.

    The token is stored as a str in self.access_token and the expiry as a
    float in self.expires.
    '''
    token_text = str(access_token)
    self.access_token = token_text
    expiry = float(expires_in)
    self.expires = expiry


def get_authorize_url(self, redirect_uri=None, display='default'):
    '''
    Return the authorize URL that the user should be redirected to.

    redirect_uri falls back to self.redirect_uri when it is empty; APIError
    is raised when neither provides a value.
    '''
    target = redirect_uri or self.redirect_uri
    if target:
        query = _encode_params(client_id=self.client_id,
                               response_type='code',
                               display=display,
                               redirect_uri=target)
        return '%s%s?%s' % (self.auth_url, 'authorize', query)
    raise APIError('21305', 'Parameter absent: redirect_uri', 'OAuth2 request')
•Biblioteka Baidu• • • • • • • • • • • • • • •
class APIClient(object):
    '''
    API client that uses synchronous calls.
    '''

    def __init__(self, app_key, app_secret, redirect_uri=None,
                 response_type='code', domain='api.weibo.com', version='2'):
        '''
        Store the application credentials and derive the OAuth2 and API
        base URLs from domain and version.
        '''
        # Application credentials and OAuth2 settings.
        self.client_id = app_key
        self.client_secret = app_secret
        self.redirect_uri = redirect_uri
        self.response_type = response_type

        # Base URLs for the authorization endpoints and the versioned API.
        self.auth_url = 'https://%s/oauth2/' % domain
        self.api_url = 'https://%s/%s/' % (domain, version)

        # No token yet.
        self.access_token = None
        self.expires = 0.0

        # One HttpObject per HTTP method, exposed as get / post / upload.
        for name, method in (('get', _HTTP_GET),
                             ('post', _HTTP_POST),
                             ('upload', _HTTP_UPLOAD)):
            setattr(self, name, HttpObject(self, method))
• • •
• • • • • • • • • • •
# NOTE(review): takes `self` and reads attributes set by APIClient.__init__,
# so this is presumably an APIClient method.
def request_access_token(self, code, redirect_uri=None):
    '''
    Exchange an authorization code for an access token object.

    Returns the object produced by _http_post, e.g.
    {"access_token": "your-access-token", "expires_in": 12345678},
    after adding the current time (in whole seconds) to its expires_in.
    Raises APIError when no redirect URI is available.
    '''
    target = redirect_uri or self.redirect_uri
    if not target:
        raise APIError('21305', 'Parameter absent: redirect_uri', 'OAuth2 request')

    token_url = '%s%s' % (self.auth_url, 'access_token')
    token = _http_post(token_url,
                       client_id=self.client_id,
                       client_secret=self.client_secret,
                       redirect_uri=target,
                       code=code,
                       grant_type='authorization_code')
    # Shift expires_in by the current time.
    token.expires_in += int(time.time())
    return token
程序的功能
• 通过 Python 程序接入新浪微博并发布微博
源代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Prefer the standard-library json module; fall back to simplejson when
# it is not available.
try:
    import json
except ImportError:
    import simplejson as json
import time
import urllib
import urllib2
import logging
# Content types for the image file extensions this module recognises.
_CONTENT_TYPES = {
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpe': 'image/jpeg',
}


def _guess_content_type(ext):
    '''
    Return the content type for a file extension such as '.png', or
    'application/octet-stream' when the extension is not listed.
    '''
    try:
        return _CONTENT_TYPES[ext]
    except KeyError:
        return 'application/octet-stream'


# Method codes passed to _http_call.
_HTTP_GET, _HTTP_POST, _HTTP_UPLOAD = 0, 1, 2


def _http_get(url, authorization=None, **kw):
    '''Log the request and issue it through _http_call as a GET.'''
    message = 'GET %s' % url
    logging.info(message)
    return _http_call(url, _HTTP_GET, authorization, **kw)


def _http_post(url, authorization=None, **kw):
    '''Log the request and issue it through _http_call as a POST.'''
    message = 'POST %s' % url
    logging.info(message)
    return _http_call(url, _HTTP_POST, authorization, **kw)
• • • •
• • • • •

# NOTE(review): fragment -- these statements read `req`, `authorization` and
# `boundary` without defining them, so this is presumably the tail of
# _http_call; confirm against the complete source.
    # Send the token in an Authorization header when one was supplied.
    if authorization:
        req.add_header('Authorization', 'OAuth2 %s' % authorization)
    # Declare a multipart body when a boundary is present.
    if boundary:
        req.add_header('Content-Type', 'multipart/form-data; boundary=%s' % boundary)
    resp = urllib2.urlopen(req)
    body = resp.read()
    # Decode the response as JSON, building objects through _obj_hook.
    r = json.loads(body, object_hook=_obj_hook)
    # A decoded object carrying error_code is raised as APIError; `error`
    # and `request` default to '' when absent.
    if hasattr(r, 'error_code'):
        raise APIError(r.error_code, getattr(r, 'error', ''), getattr(r, 'request', ''))
    return r
# NOTE(review): reads error_code / error / request, the attributes set by
# APIError.__init__, so this is presumably APIError.__str__.
def __str__(self):
    '''Render the error code, message and request on one line.'''
    fields = (self.error_code, self.error, self.request)
    return 'APIError: %s: %s, request: %s' % fields
通用的 JSON 对象:既可以像普通对象一样通过属性访问,也可以当作字典使用
class JsonObject(dict):
    '''
    A dict whose items can also be read and written as attributes.
    '''

    def __getattr__(self, attr):
        '''
        Return self[attr].

        Raises AttributeError (not KeyError) for a missing key, because
        getattr(obj, name, default) only falls back to the default on
        AttributeError.
        '''
        try:
            return self[attr]
        except KeyError:
            raise AttributeError("'JsonObject' object has no attribute '%s'" % attr)

    def __setattr__(self, attr, value):
        '''Store value under the key attr.'''
        self[attr] = value


def _encode_params(**kw):
    '''
    Encode the keyword arguments as a URL query string.

    unicode values are encoded as UTF-8 and every other value is passed
    through str(); each value is then URL-quoted and the resulting
    'key=value' pairs are joined with '&'.
    '''
    args = []
    for k, v in kw.iteritems():
        qv = v.encode('utf-8') if isinstance(v, unicode) else str(v)
        args.append('%s=%s' % (k, urllib.quote(qv)))
    return '&'.join(args)
# NOTE(review): fragment -- it appends to `data` and uses `boundary`, `k` and
# `v` without defining them, so this is presumably the tail of
# _encode_multipart; the nesting shown below is an assumption to confirm
# against the complete source.
        else:
            # Plain field: header line, then the value (unicode is encoded
            # as UTF-8, anything else is appended unchanged).
            data.append('Content-Disposition: form-data; name="%s"\r\n' % k)
            data.append(v.encode('utf-8') if isinstance(v, unicode) else v)
    # Closing boundary line.
    data.append('--%s--\r\n' % boundary)
    # Return the CRLF-joined body together with the boundary.
    return '\r\n'.join(data), boundary
class HttpObject(object):
    '''
    Turns attribute access into API calls made with one HTTP method on
    behalf of a client.
    '''

    def __init__(self, client, method):
        '''Remember the owning client and the HTTP method code to use.'''
        self.client = client
        self.method = method

    def __getattr__(self, attr):
        '''
        Return a function that calls the API named by attr.

        Each '__' in attr becomes '/' in the request path and '.json' is
        appended. The returned function raises APIError when the client
        reports that its token has expired.
        '''
        def wrap(**kw):
            client = self.client
            if client.is_expires():
                raise APIError('21327', 'expired_token', attr)
            path = attr.replace('__', '/')
            endpoint = '%s%s.json' % (client.api_url, path)
            return _http_call(endpoint, self.method, client.access_token, **kw)
        return wrap
• 导入必须的模块
定义函数将json对象转化为python对象
def _obj_hook(pairs):
    '''
    Build a JsonObject from a decoded JSON mapping, converting every key
    to str.
    '''
    return JsonObject((str(key), value) for key, value in pairs.iteritems())
定义一个类实现报错功能
class APIError(StandardError):
    '''
    Error that carries an error code, an error message and the request it
    belongs to.
    '''

    def __init__(self, error_code, error, request):
        '''Initialise the base error with the message and keep all three fields.'''
        StandardError.__init__(self, error)
        self.error_code = error_code
        self.error = error
        self.request = request
• • • • • • • • • • • • • • •
def _http_upload(url, authorization=None, **kw):
    # Log the request and issue it through _http_call as a multipart upload.
    logging.info('MULTIPART POST %s' % url)
    return _http_call(url, _HTTP_UPLOAD, authorization, **kw)


def _http_call(url, method, authorization, **kw):
    '''
    Send an HTTP request and, if there is no error, expect a JSON object
    in return.
    '''
    params = None
    boundary = None
    # Uploads are encoded as multipart data (which also yields a boundary);
    # every other method gets URL-encoded parameters.
    if method==_HTTP_UPLOAD:
        params, boundary = _encode_multipart(**kw)
    else:
        params = _encode_params(**kw)
    # GET carries the parameters in the query string and sends no body;
    # the other methods send them as the request body.
    http_url = '%s?%s' % (url, params) if method==_HTTP_GET else url
    http_body = None if method==_HTTP_GET else params
    req = urllib2.Request(http_url, data=http_body)
    # NOTE(review): the listing breaks off here; the rest of this function
    # (headers, sending the request, decoding the response) is not on this
    # line.
• • • • • • • • • • • • • • • • • •
def _encode_multipart(**kw):
    '''
    Build a multipart/form-data body.

    The boundary is derived from the current time in milliseconds,
    formatted as hex.
    '''
    boundary = '----------%s' % hex(int(time.time() * 1000))
    data = []
    for k, v in kw.iteritems():
        data.append('--%s' % boundary)
        if hasattr(v, 'read'):
            # file-like object:
            ext = ''
            filename = getattr(v, 'name', '')
            # Take the extension from the last '.' onward, lower-cased.
            n = filename.rfind('.')
            if n != (-1):
                ext = filename[n:].lower()
            content = v.read()
            data.append('Content-Disposition: form-data; name="%s"; filename="hidden"' % k)
            data.append('Content-Length: %d' % len(content))
            data.append('Content-Type: %s\r\n' % _guess_content_type(ext))
            data.append(content)
        # NOTE(review): the listing breaks off here; the branch for values
        # that are not file-like, the closing boundary and the return
        # statement are not on this line.
相关文档
最新文档