Py学习  »  akin_ai  »  全部回复
回复总数  1

拜托!不要用正则表达式(regex)解析HTML,请使用bs4这样的模块。但如果你坚持这样做的话:

import requests
import re

url = 'https://en.wikipedia.org/wiki/Collatz_conjecture'

# An opening tag: '<', then a name that starts with an ASCII letter, then
# (look-ahead only) whitespace, '/' or '>'.  Requiring a leading letter skips
# closing tags ('</p>'), comments ('<!-- -->'), doctypes and conditional
# comments ('<![endif]-->') by rule instead of by a hard-coded exclusion list.
# The look-ahead keeps '<br/>' and '<br>' from being counted as different
# tags and rejects stray text such as 'a<b)' inside inline scripts.
_OPEN_TAG = re.compile(r'<([A-Za-z][A-Za-z0-9:-]*)(?=[\s/>])')


def unique_html_tags(html):
    """Return the set of distinct element names opened in *html*.

    Names are lower-cased, so ``<DIV>`` and ``<div>`` count once, and
    self-closing forms (``<br/>``) are folded into the plain name (``br``).
    Closing tags, comments and ``<!...>`` declarations are ignored.

    This is a regex approximation, not an HTML parser: tag-like text inside
    comments, scripts or attribute values can still be picked up. Use a real
    parser (e.g. bs4) when accuracy matters.

    Args:
        html: The HTML document as a string.

    Returns:
        A set of lower-case tag names; empty when no opening tag is found.
    """
    return {name.lower() for name in _OPEN_TAG.findall(html)}


def main():
    """Fetch ``url`` and print how many distinct HTML tags it uses."""
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors rather than counting tags of an error page.
    response.raise_for_status()

    final = unique_html_tags(response.text)
    print('Number of Unique HTML tags : ', len(final))


if __name__ == '__main__':
    main()