[phantomjs] add cookie support

This commit is contained in:
Tithen-Firion 2017-04-25 15:12:54 +02:00
parent da57ebaf84
commit 40e41780f1
2 changed files with 66 additions and 4 deletions

View File

@ -2343,10 +2343,12 @@ def _float(self, v, name, fatal=False, **kwargs):
self._downloader.report_warning(msg) self._downloader.report_warning(msg)
return res return res
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                path='/', secure=False, discard=False, rest=None, **kwargs):
    """Build a cookie and add it to the downloader's cookie jar.

    domain      -- cookie domain; a leading '.' marks it as an
                   "initial dot" domain
    name, value -- cookie name and value
    expire_time -- expiry passed to the cookie as-is (None = no expiry)
    port        -- cookie port; the cookie is flagged as port-specified
                   only when this is not None
    path        -- cookie path (always flagged as explicitly specified)
    secure      -- secure flag
    discard     -- discard flag
    rest        -- dict of non-standard attributes (e.g. httpOnly);
                   None means "no extra attributes"
    **kwargs    -- accepted and ignored, so that callers may pass a dict
                   carrying extra keys
    """
    # Avoid a mutable default argument: create a fresh dict per call.
    if rest is None:
        rest = {}
    cookie = compat_cookiejar.Cookie(
        0, name, value, port, port is not None, domain, True,
        domain.startswith('.'), path, True, secure, expire_time,
        discard, None, None, rest)
    self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url): def _get_cookies(self, url):

View File

@ -3654,6 +3654,37 @@ def write_xattr(path, key, value):
"or the 'xattr' binary.") "or the 'xattr' binary.")
def cookie_to_dict(cookie):
    """Convert a cookiejar Cookie object into a plain dict.

    'name' and 'value' are always present.  'port', 'domain' and 'path'
    are copied only when the cookie flags them as explicitly specified;
    'expires', 'secure' and 'discard' only when they are not None.
    'httponly' is set to True when the cookie carries a non-standard
    httpOnly attribute (in any of three common spellings).
    """
    cookie_dict = {
        'name': cookie.name,
        'value': cookie.value,
    }
    if cookie.port_specified:
        cookie_dict['port'] = cookie.port
    if cookie.domain_specified:
        cookie_dict['domain'] = cookie.domain
    if cookie.path_specified:
        cookie_dict['path'] = cookie.path
    if cookie.expires is not None:
        cookie_dict['expires'] = cookie.expires
    if cookie.secure is not None:
        cookie_dict['secure'] = cookie.secure
    if cookie.discard is not None:
        cookie_dict['discard'] = cookie.discard
    try:
        if (cookie.has_nonstandard_attr('httpOnly') or
                cookie.has_nonstandard_attr('httponly') or
                cookie.has_nonstandard_attr('HttpOnly')):
            cookie_dict['httponly'] = True
    except TypeError:
        # The attribute lookup raised TypeError (e.g. the cookie has no
        # usable non-standard attribute mapping); treat as not httpOnly.
        pass
    return cookie_dict
def cookie_jar_to_list(cookie_jar):
    """Return a list with one cookie_to_dict() result per cookie in the jar."""
    return list(map(cookie_to_dict, cookie_jar))
class PhantomJSwrapper(object): class PhantomJSwrapper(object):
"""PhantomJS wrapper class""" """PhantomJS wrapper class"""
@ -3674,6 +3705,9 @@ class PhantomJSwrapper(object):
var fs = require('fs'); var fs = require('fs');
var read = {{ mode: 'r', charset: 'utf-8' }}; var read = {{ mode: 'r', charset: 'utf-8' }};
var write = {{ mode: 'w', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }};
JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
phantom.addCookie(x);
}});
page.settings.resourceTimeout = {timeout}; page.settings.resourceTimeout = {timeout};
page.settings.userAgent = "{ua}"; page.settings.userAgent = "{ua}";
page.onLoadStarted = function() {{ page.onLoadStarted = function() {{
@ -3684,6 +3718,7 @@ class PhantomJSwrapper(object):
}}; }};
var saveAndExit = function() {{ var saveAndExit = function() {{
fs.write("{html}", page.content, write); fs.write("{html}", page.content, write);
fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
phantom.exit(); phantom.exit();
}}; }};
page.onLoadFinished = function(status) {{ page.onLoadFinished = function(status) {{
@ -3697,7 +3732,7 @@ class PhantomJSwrapper(object):
page.open(""); page.open("");
''' '''
_TMP_FILE_NAMES = ['script', 'html'] _TMP_FILE_NAMES = ['script', 'html', 'cookies']
def __init__(self, extractor, timeout=10000): def __init__(self, extractor, timeout=10000):
self.exe = check_executable('phantomjs', ['-v']) self.exe = check_executable('phantomjs', ['-v'])
@ -3722,6 +3757,26 @@ def __del__(self):
except: except:
pass pass
def _save_cookies(self, url):
    """Dump the downloader's cookies as JSON into the 'cookies' temp file.

    Cookies that lack an explicit path get '/', and cookies that lack an
    explicit domain get the network location of `url`.  The file is the
    one the PhantomJS script template reads its cookies from.
    """
    cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
    # Loop-invariant: parse the URL once instead of once per cookie.
    # NOTE(review): netloc may include a port ("host:8080"); confirm
    # whether the bare hostname would be the better fallback domain.
    default_domain = compat_urlparse.urlparse(url).netloc
    for cookie in cookies:
        cookie.setdefault('path', '/')
        cookie.setdefault('domain', default_domain)
    with open(self._TMP_FILES['cookies'].name, 'wb') as f:
        f.write(json.dumps(cookies).encode('utf-8'))
def _load_cookies(self):
with open(self._TMP_FILES['cookies'].name, 'rb') as f:
cookies = json.loads(f.read().decode('utf-8'))
for cookie in cookies:
if cookie['httponly'] is True:
cookie['rest'] = { 'httpOnly': None }
if 'expiry' in cookie:
cookie['expire_time'] = cookie['expiry']
self.extractor._set_cookie(**cookie)
def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
""" """
Downloads webpage (if needed) and executes JS Downloads webpage (if needed) and executes JS
@ -3765,6 +3820,8 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w
with open(self._TMP_FILES['html'].name, 'wb') as f: with open(self._TMP_FILES['html'].name, 'wb') as f:
f.write(html.encode('utf-8')) f.write(html.encode('utf-8'))
self._save_cookies(url)
replaces = self.options replaces = self.options
replaces['url'] = url replaces['url'] = url
user_agent = headers.get('User-Agent') or std_headers['User-Agent'] user_agent = headers.get('User-Agent') or std_headers['User-Agent']
@ -3791,5 +3848,8 @@ def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on w
+ encodeArgument(err)) + encodeArgument(err))
with open(self._TMP_FILES['html'].name, 'rb') as f: with open(self._TMP_FILES['html'].name, 'rb') as f:
html = f.read().decode('utf-8') html = f.read().decode('utf-8')
self._load_cookies()
return (html, encodeArgument(out)) return (html, encodeArgument(out))