"""
|
|
This module contains useful utility methods.
|
|
"""
|
|
import codecs
|
|
import re
|
|
import requests
|
|
|
|
|
|


def get_domain_url(url):
    """
    Use this to convert a url like this:
    https://blog.xkcd.com/2014/07/22/what-if-book-tour/
    Into this:
    https://blog.xkcd.com
    """
    if not url.startswith(("http://", "https://")):
        return url
    url_header = url.split('://')[0]
    simple_url = url.split('://')[1]
    base_url = simple_url.split('/')[0]
    domain_url = url_header + '://' + base_url
    return domain_url
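
# Example (matches the conversion shown in the docstring above):
#     get_domain_url("https://blog.xkcd.com/2014/07/22/what-if-book-tour/")
#     # -> "https://blog.xkcd.com"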


def is_xpath_selector(selector):
    """
    A basic method to determine if a selector is an xpath selector.
    """
    return selector.startswith(('/', './', '('))
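
# Example (illustrative selectors):
#     is_xpath_selector('//div[@id="content"]')  # -> True
#     is_xpath_selector('div#content')  # -> False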


def is_link_text_selector(selector):
    """
    A basic method to determine if a selector is a link text selector.
    """
    return selector.startswith(('link=', 'link_text='))


def get_link_text_from_selector(selector):
    """
    A basic method to get the link text from a link text selector.
    """
    if selector.startswith('link='):
        return selector.split('link=')[1]
    elif selector.startswith('link_text='):
        return selector.split('link_text=')[1]
    return selector
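
# Example (illustrative selector):
#     get_link_text_from_selector("link=What If?")  # -> "What If?"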


def is_valid_url(url):
    """
    A basic method to determine if a url is syntactically valid.
    Accepts http(s) urls with a domain, "localhost", or an IP address.
    """
    regex = re.compile(
        r'^(?:http)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
        r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return bool(regex.match(url)) or url in ('about:blank', 'data:,')
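
# Example (illustrative urls):
#     is_valid_url("https://xkcd.com/353/")  # -> True
#     is_valid_url("xkcd.com")  # -> False (missing the scheme)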


def _get_unique_links(page_url, soup):
    """
    Returns all unique links.
    Includes:
    "a"->"href", "img"->"src", "link"->"href", and "script"->"src" links.
    """
    if not page_url.startswith(("http://", "https://")):
        return []
    prefix = 'http:'
    if page_url.startswith('https:'):
        prefix = 'https:'
    simple_url = page_url.split('://')[1]
    base_url = simple_url.split('/')[0]
    full_base_url = prefix + "//" + base_url

    raw_links = []
    raw_unique_links = []

    # Get "href" from all "a" tags
    links = soup.find_all('a')
    for link in links:
        raw_links.append(link.get('href'))

    # Get "src" from all "img" tags
    img_links = soup.find_all('img')
    for img_link in img_links:
        raw_links.append(img_link.get('src'))

    # Get "href" from all "link" tags
    links = soup.find_all('link')
    for link in links:
        raw_links.append(link.get('href'))

    # Get "src" from all "script" tags
    script_links = soup.find_all('script')
    for script_link in script_links:
        raw_links.append(script_link.get('src'))

    # Remove duplicates while keeping the original order
    for link in raw_links:
        if link not in raw_unique_links:
            raw_unique_links.append(link)

    # Expand relative, protocol-relative, and anchor links into full urls
    unique_links = []
    for link in raw_unique_links:
        if link and len(link) > 1:
            if link.startswith('//'):
                link = prefix + link
            elif link.startswith('/'):
                link = full_base_url + link
            elif link.startswith('./'):
                link = full_base_url + link[1:]
            elif link.startswith('#'):
                link = full_base_url + link
            elif '//' not in link:
                link = full_base_url + "/" + link
            unique_links.append(link)

    return unique_links
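
# A minimal usage sketch (assumes the "beautifulsoup4" package is
# installed; the url is just an illustration):
#     from bs4 import BeautifulSoup
#     page_url = "https://xkcd.com/353/"
#     soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
#     unique_links = _get_unique_links(page_url, soup)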


def _get_link_status_code(link, allow_redirects=False, timeout=5):
    """ Get the status code of a link.
        If the request raises an exception (such as a timeout),
        a 404 is returned instead.
        For a list of available status codes, see:
        https://en.wikipedia.org/wiki/List_of_HTTP_status_codes
    """
    status_code = None
    try:
        response = requests.get(
            link, allow_redirects=allow_redirects, timeout=timeout)
        status_code = response.status_code
    except Exception:
        status_code = 404
    return status_code
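
# Example (illustrative; the result depends on the live response):
#     _get_link_status_code("https://xkcd.com/353/")
#     # -> 200 if reachable; 404 if the request fails or times out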


def _print_unique_links_with_status_codes(page_url, soup):
    """ Finds all unique links in the html of the page source
        and then prints out those links with their status codes.
        Format: ["link" -> "status_code"] (per line)
        Page links include those obtained from:
        "a"->"href", "img"->"src", "link"->"href", and "script"->"src".
    """
    links = _get_unique_links(page_url, soup)
    for link in links:
        status_code = _get_link_status_code(link)
        print(link, " -> ", status_code)
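
# Example output (illustrative links and status codes):
#     https://xkcd.com/s/919f27.css  ->  200
#     https://xkcd.com/s/0b7742.png  ->  200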


def _download_file_to(file_url, destination_folder, new_file_name=None):
    """
    Downloads the file at file_url into destination_folder.
    Keeps the file name from the url unless new_file_name is given.
    """
    if new_file_name:
        file_name = new_file_name
    else:
        file_name = file_url.split('/')[-1]
    r = requests.get(file_url)
    with open(destination_folder + '/' + file_name, "wb") as out_file:
        out_file.write(r.content)
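
# Example (hypothetical url and folder):
#     _download_file_to("https://example.com/files/report.pdf", "downloads")
#     # Saves the file to "downloads/report.pdf"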


def _save_data_as(data, destination_folder, file_name):
    """
    Saves data to a file named file_name inside destination_folder.
    """
    out_file = codecs.open(
        destination_folder + '/' + file_name, "w+", encoding="utf-8")
    out_file.writelines(data)
    out_file.close()


def make_css_match_first_element_only(selector):
    # Only get the first match by appending ":first" to the selector.
    # (":first" is a jQuery-style pseudo-class, not standard CSS.)
    last_syllable = selector.split(' ')[-1]
    if ':' not in last_syllable and ':contains' not in selector:
        selector += ':first'
    return selector
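
# Example (illustrative selectors):
#     make_css_match_first_element_only("div.comment a")
#     # -> "div.comment a:first"
#     make_css_match_first_element_only("a:contains('Next')")
#     # -> unchanged, since the selector already contains a ":"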