下面的代码将从所讨论的网页(URL)中提取出 63 个不重复的 URL:
import requests
import re
url = "https://en.wikipedia.org/wiki/Collatz_conjecture"

# Download the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() prevents us from silently
# counting URLs found in an HTTP error page instead of the article.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text

# Pattern for absolute http(s) URLs. The outermost parentheses make the whole
# URL capture group 1; the inner groups are only there for grouping.
# NOTE(review): `[.com]+` is a character class (one or more of '.', 'c', 'o',
# 'm'), not the literal text ".com". It is left unchanged here so the set of
# matches stays the same as before.
url_pattern = r"((http(s)?://)([\w-]+\.)+[\w-]+[.com]+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)"

# Because the pattern contains several groups, re.findall returns a list of
# tuples (one per match); only the first item of each tuple is the full URL.
urls = re.findall(url_pattern, text)

# Keep the full URL from every match; building a set drops the duplicates.
unique_urls = {match[0] for match in urls}

# NOTE(review): the label says "HTML tags" although the value is the number of
# unique URLs; the text is kept as-is so the printed line matches the
# documented sample output.
print(f'Number of unique HTML tags: {len(unique_urls)} found on {url}')
输出:
Number of unique HTML tags: 63 found on https://en.wikipedia.org/wiki/Collatz_conjecture