# SeleniumBase/seleniumbase/fixtures/page_utils.py
#
# 183 lines
# 5.4 KiB
# Python
# Executable File

"""
This module contains useful utility methods.
"""
import codecs
import re
import requests
def get_domain_url(url):
    """
    Reduce a full url to its scheme and host.

    Use this to convert a url like this:
    https://blog.xkcd.com/2014/07/22/what-if-book-tour/
    Into this:
    https://blog.xkcd.com

    A url that does not start with "http://" or "https://" is returned
    unchanged.
    """
    # Anchor the scheme check to the start of the string: a scheme that only
    # shows up later (e.g. inside a query string of a relative url) must not
    # cause the url to be split and reassembled.
    if not url.startswith(("http://", "https://")):
        return url
    url_header, _, simple_url = url.partition('://')
    # The host part ends at the first "/", "?" or "#", whichever comes first,
    # so "https://host?x=1" and "https://host#top" are reduced correctly too.
    base_url = re.split(r'[/?#]', simple_url, maxsplit=1)[0]
    return url_header + '://' + base_url
def is_xpath_selector(selector):
    """
    Tell whether a selector should be treated as an xpath selector.

    This is a prefix check only: a selector counts as xpath when it
    starts with "/", "./" or "(".
    """
    xpath_prefixes = ('/', './', '(')
    return selector.startswith(xpath_prefixes)
def is_link_text_selector(selector):
    """
    Tell whether a selector should be treated as a link text selector.

    This is a prefix check only: a selector counts as link text when it
    starts with "link=" or "link_text=".
    """
    for marker in ('link=', 'link_text='):
        if selector.startswith(marker):
            return True
    return False
def get_link_text_from_selector(selector):
    """
    Get the link text from a link text selector.

    Removes a leading "link=" or "link_text=" marker and returns whatever
    follows it. A selector that starts with neither marker is returned
    unchanged.
    """
    for marker in ('link=', 'link_text='):
        if selector.startswith(marker):
            # Slice the marker off rather than splitting on it, so link text
            # that itself contains the marker again is not cut short.
            return selector[len(marker):]
    return selector
def is_valid_url(url):
    """
    Tell whether a string is accepted as a url.

    Accepted values are http/https urls matching the pattern below
    (case-insensitive), plus the two literal values 'about:blank'
    and 'data:,'.
    """
    url_pattern = re.compile(
        r'^(?:http)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE,
    )
    # The pattern is tried first; the literal values are only a fallback.
    if url_pattern.match(url) is not None:
        return True
    return url in ('about:blank', 'data:,')
def _get_unique_links(page_url, soup):
    """
    Returns all unique links, in the order they are first seen.
    Includes:
    "a"->"href", "img"->"src", "link"->"href", and "script"->"src" links.

    page_url: url of the page the links came from. It must start with
        "http://" or "https://"; otherwise an empty list is returned.
    soup: object providing find_all(tag_name), whose results provide
        get(attribute_name) (presumably a BeautifulSoup document; confirm
        against the caller).

    Links that are not already absolute are rewritten against the scheme
    and host of page_url. Missing values and values of one character or
    less are skipped.
    """
    # Anchor the scheme check to the start of the string, so a scheme that
    # only appears later in page_url does not count.
    if not page_url.startswith(("http://", "https://")):
        return []
    prefix = 'https:' if page_url.startswith('https:') else 'http:'
    simple_url = page_url.split('://')[1]
    # The host part ends at the first "/", "?" or "#".
    base_url = re.split(r'[/?#]', simple_url, maxsplit=1)[0]
    full_base_url = prefix + "//" + base_url

    # Collect the raw attribute values, one tag type at a time.
    raw_links = []
    for tag_name, attribute in (
            ('a', 'href'), ('img', 'src'), ('link', 'href'),
            ('script', 'src')):
        for tag in soup.find_all(tag_name):
            raw_links.append(tag.get(attribute))

    unique_links = []
    seen = set()
    for link in raw_links:
        if not link or len(link) <= 1:
            continue
        if link.startswith('//'):
            # Scheme-relative link: reuse the scheme of the page.
            link = prefix + link
        elif link.startswith('/'):
            link = full_base_url + link
        elif link.startswith('./'):
            link = full_base_url + link[1:]
        elif link.startswith('#'):
            link = full_base_url + link
        elif '//' not in link:
            link = full_base_url + "/" + link
        # Deduplicate AFTER normalizing: different raw values (such as
        # "/x" and "./x") can resolve to the same absolute link.
        if link not in seen:
            seen.add(link)
            unique_links.append(link)
    return unique_links
def _get_link_status_code(link, allow_redirects=False, timeout=5):
    """ Fetch a link with a GET request and report its status code.
        Any exception raised while making the request (an exceeded
        timeout included) is reported as a 404.
        For a list of available status codes, see:
        https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
    """
    try:
        reply = requests.get(
            link, allow_redirects=allow_redirects, timeout=timeout)
        return reply.status_code
    except Exception:
        # Best-effort lookup: a link that cannot be fetched counts as 404.
        return 404
def _print_unique_links_with_status_codes(page_url, soup):
    """ Prints every unique link of the page next to its status code,
        one link per line, with " -> " between the two values.
        The links are the ones collected by _get_unique_links():
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src".
    """
    for page_link in _get_unique_links(page_url, soup):
        print(page_link, " -> ", _get_link_status_code(page_link))
def _download_file_to(file_url, destination_folder, new_file_name=None):
    """
    Download file_url and write the response body into destination_folder.

    The file is saved as new_file_name when one is given; otherwise the
    name is the part of file_url after its last "/".
    """
    file_name = new_file_name if new_file_name else file_url.split('/')[-1]
    response = requests.get(file_url)
    target_path = destination_folder + '/' + file_name
    with open(target_path, "wb") as target_file:
        target_file.write(response.content)
def _save_data_as(data, destination_folder, file_name):
    """
    Write data to the file destination_folder + '/' + file_name.

    The file is opened in "w+" mode, so existing content is replaced.
    data is handed to writelines(), so it may be a single string or an
    iterable of strings; no line separators are added.
    """
    # The with-block closes the file even when writelines() raises, so a
    # failed write no longer leaks the open file handle.
    with codecs.open(destination_folder + '/' + file_name, "w+") as out_file:
        out_file.writelines(data)
def make_css_match_first_element_only(selector):
    """
    Append ':first' to a css selector so that only the first match is used.

    The selector is returned unchanged when its last space-separated part
    already contains ':' or when ':contains' appears anywhere in it.
    """
    final_part = selector.rsplit(' ', 1)[-1]
    if ':contains' in selector or ':' in final_part:
        return selector
    return selector + ':first'