"""A program to quickly extract links from a url"""
import asyncio
from itertools import chain
import re
from bs4 import BeautifulSoup
import aiohttp
def _format_base_url(base_url: str):
"""Properly format URL to start with protocol and end with slash.
Parameters
---------
base_url : str
the original URL supplied
Returns
-------
str :
url with format like `https://.../`
"""
base_url = 'https://' + \
base_url if not base_url.startswith(
('http://', 'https://')) else base_url
base_url = base_url + '/' if not base_url.endswith('/') else base_url
return base_url
async def _async_get_html(base_url: str, ssl: bool = None):
    """Download the body of a URL as text.

    Parameters
    ----------
    base_url : str
        the URL to request
    ssl : bool
        value handed to ``aiohttp.TCPConnector(ssl=...)``.
        ``None`` (the default) is replaced by ``False``, which per the
        aiohttp documentation skips SSL certificate validation.

    Returns
    -------
    str :
        the response text when the status code is 200,
        otherwise an empty string
    """
    verify = False if ssl is None else ssl
    connector = aiohttp.TCPConnector(ssl=verify)
    # trust_env=True is forwarded to aiohttp so it may pick up proxy
    # settings from the environment (see aiohttp ClientSession docs).
    async with aiohttp.ClientSession(connector=connector, trust_env=True) as session:
        async with session.get(base_url) as response:
            # Any non-200 answer is treated as "no page" rather than an error.
            if response.status != 200:
                return ""
            return await response.text()
def _get_links(html_page: str):
    """Get all links from HTML.

    Parameters
    ----------
    html_page : str
        document html

    Returns
    -------
    list :
        the ``href`` value (as a string) of every ``<a>`` tag whose
        ``href`` matches the pattern below
        (these could be files or sub-directories)
    """
    # "html.parser" ships with Python; "lxml" is supposed to be faster.
    soup = BeautifulSoup(html_page, "html.parser")
    # The leading "." alternative already matches any href that contains at
    # least one non-newline character, so empty hrefs and anchors without an
    # href are the ones filtered out here.
    href_pattern = re.compile(".|(/$)")
    # find_all is the supported spelling; findAll is a deprecated alias.
    anchors = soup.find_all('a', attrs={'href': href_pattern})
    return [f"{anchor.get('href')}" for anchor in anchors]
def _get_sub_dirs(links: list, base_url: str):
"""Gets sub-directories from list of links.
Parameters
----------
links : list
list of links, contains files and sub-directories
base_url : str
the original URL supplied
Returns
-------
list :
only the links that point to sub-directories are returned
"""
sub_dirs = [f"{base_url}{link}" for link in links if re.search(r'/$', link)]
return sub_dirs
def _get_files(links: list, regex: str = None):
"""Gets files from list of links.
Parameters
----------
links : list
list of links to files and sub-directories
regex : str
filter links based on a regular expression
Returns
-------
list :
only the links that point to files are returned
"""
if regex is None:
regex = r'[^/]$'
file_links = [link for link in links if re.search(regex, link)]
return file_links
def _filter_with_regex(links: list, regex: str):
"""Filters files by regular expressions.
Parameters
----------
links : list
list of links to files and sub-directories
regex : str
regular expression string
Returns
-------
list :
a list of links with regular expression applied
"""
return [link for link in links if re.search(regex, link)]
def _prepend_with_baseurl(links: list, base_url: str):
"""prepend url to beginning of each file
Parameters
----------
links : list
list of links to files and sub-directories
base_url : str
base url
Returns
------
list :
a list of links with base url pre-pended
"""
return [base_url + link for link in links]
async def _gather_with_concurrency(n: int, *tasks):
"""Limits open files to avoid 'too many open files' error.
Parameters
----------
n : int
Number of files to open at once
tasks : list
list of tasks to gather output from
Returns
-------
awaitable :
gathered coroutines that need to awaited
Notes
-----
https://stackoverflow.com/questions/48483348/how-to-limit-concurrency-with-python-asyncio/61478547#61478547
"""
semaphore = asyncio.Semaphore(n)
async def sem_task(task):
async with semaphore:
return await task
return await asyncio.gather(*(sem_task(task) for task in tasks))
async def _async_link_extractor(base_url: str, search_subs: bool = None, regex: str = None, *args, **kwargs):
    """Asynchronously extract file links from a URL.

    Parameters
    ----------
    base_url : str
        URL you want to search
    search_subs : bool
        when truthy, every sub-directory link found on the page is scanned
        as well. Those scans are started with default arguments, so they do
        not descend any further themselves.
    regex : str
        filter links based on a regular expression. It is handed to
        ``_get_files`` for the links on the page and, when not ``None``,
        applied once more to the combined list of full URLs.
    *args, **kwargs :
        accepted but not used by this function

    Returns
    -------
    list :
        list of files (full URLs)
    """
    root = _format_base_url(base_url)
    page = await _async_get_html(root)
    hrefs = _get_links(html_page=page)

    sub_dirs = _get_sub_dirs(hrefs, root)
    found = _prepend_with_baseurl(_get_files(hrefs, regex=regex), root)

    # gathers files from sub-directories
    if search_subs:
        pending = [_async_link_extractor(sub_dir) for sub_dir in sub_dirs]
        per_dir_results = await _gather_with_concurrency(200, *pending)
        found.extend(chain.from_iterable(per_dir_results))

    if regex is None:
        return found
    # Sub-directory results were collected unfiltered, so filter the lot here.
    return _filter_with_regex(found, regex)