有高手可以帮帮我吗?为什么抓取网页总是显示为None.而且我用urllib库抓取网页时,IDEL总是什么也不显示。不知道是什么回事?
def get_page(url):
try:
import urllib
return urllib.urlopen(url).read()
except:
return ''
print get_page("http://www.baidu.com")
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return None,0
start_quote = page.find('"',start_link)
end_quote = page.find('"',start_quote + 1)
url = page[start_quote + 1:end_quote]
return url,end_quote
print get_next_target('this is a <a href="http://udacity.com">link!</a>')
url,endpos = get_next_target('this is a <a href="http://udacity.com">link!</a>')
print url,endpos
def print_all_links(page):
while True:
url,endpos = get_next_target(page)
if url:
print url
page = page[endpos:]
else:
break
print_all_links('this <a href="test1">link 1</a> is <a href="test2">link 2</a> is <a #href="test3">link 3</a>')
print print_all_links(get_page('https://www.nvovn.com/'))