拜托!不要用正则表达式(regex)解析 HTML,请使用 bs4 这样的模块。但如果你坚持这样做的话:
import requests
import re
url = 'https://en.wikipedia.org/wiki/Collatz_conjecture'

# A "tag token" is any run from '<' up to the next '>'. This is a deliberate
# approximation: regular expressions cannot parse HTML properly, so text such
# as "a < b > c" inside scripts is tokenised too and filtered out afterwards.
_TAG_TOKEN = re.compile(r'<[^>]*>')

# An opening tag starts with '<' immediately followed by an ASCII letter.
# Requiring the letter rejects closing tags ("</p>"), comments, doctypes and
# other "<!...>" declarations, as well as junk such as "<>" -- no hard-coded
# exclusion list is needed. The name stops at whitespace, '/' or '>', so
# "<br/>" and "<br>" yield the same name.
_OPENING_TAG_NAME = re.compile(r'<([A-Za-z][^\s/>]*)')


def unique_html_tags(html):
    """Return the distinct opening tags found in *html*.

    Args:
        html: The markup to scan, as a string.

    Returns:
        A sorted list of strings of the form ``'<name>'``, one per distinct
        tag name. Names are lower-cased because HTML tag names are
        case-insensitive. Closing tags, comments, doctype declarations and
        tokens without a valid tag name are ignored.
    """
    names = set()
    for token in _TAG_TOKEN.findall(html):
        found = _OPENING_TAG_NAME.match(token)
        # Tokens with no tag name (e.g. "<>", "</div>", "<!-- x -->") do not
        # match; skip them instead of failing on a missing match object.
        if found:
            names.add('<' + found.group(1).lower() + '>')
    return sorted(names)


def main():
    """Download the page at ``url`` and print how many unique tags it uses."""
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() avoids counting tags of an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    final = unique_html_tags(response.text)
    print('Number of Unique HTML tags : ', len(final))


if __name__ == '__main__':
    main()