In this example, we show how to scrape data from the web with the BeautifulSoup library in Python.
Before building the crawler, we need to install a few related libraries:
pip install requests
pip install BeautifulSoup4
pip install lxml
Source Code
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# crawlTable.py
# Crawl data from table in Python
# pip install requests
# pip install BeautifulSoup4
# pip install lxml
import requests
from bs4 import BeautifulSoup
import bs4
# Get text data from url
def getHtmlText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text on success, or the sentinel string
        'Fail to crawl' when the request fails (connection problem,
        timeout, invalid URL, or a 4xx/5xx status code).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they take the failure path.
        r.raise_for_status()
    except requests.RequestException:
        # Catch only request-level failures; a bare ``except`` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return 'Fail to crawl'
    return r.text
# Fill the table
def fillCompList(clist, html):
    """Append [rank, name, revenue] records parsed from *html* to *clist*.

    The first ``<tbody>`` in the document is used. Its first ``<tr>`` is
    skipped (presumably the header row -- confirm against the page layout).
    For every remaining row the rank is read from the row's ``<th>``, the
    name from the first ``<td>`` and the revenue from the third ``<td>``.

    Args:
        clist: List that is extended in place with one 3-item list per row.
        html: HTML document text to parse.

    Returns:
        None. Nothing is appended when the document has no ``<tbody>``;
        rows lacking a ``<th>`` or having fewer than three ``<td>`` cells
        are skipped instead of raising.
    """
    soup = BeautifulSoup(html, 'lxml')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table in the document (e.g. the download failed): nothing to add.
        return
    for tr in tbody.find_all('tr')[1:]:
        th = tr.find('th')
        tds = tr.find_all('td')
        if th is None or len(tds) < 3:
            # Row does not have the expected rank/name/revenue cells.
            continue
        # get_text() is used for the rank because ``.string`` is None when
        # the <th> contains more than one child node.
        clist.append([
            th.get_text().strip(),
            tds[0].text.strip(),
            tds[2].text.strip(),
        ])
# output data
def printCompList(clist, num):
    """Print a header line followed by at most *num* records from *clist*.

    Args:
        clist: Sequence of records; items 0, 1 and 2 of each record are
            printed as rank, name and revenue.
        num: Maximum number of records to print. If *clist* holds fewer
            records, only the available ones are printed; a value of zero
            or less prints the header only.
    """
    # Columns are separated by a tab ('\t'); '<' left-aligns each value and
    # the number is the minimum field width.
    tplt = '{:<10}\t{:<20}\t{:<20}'
    print(tplt.format('Rank', 'Name', 'Revenue(USD millions)'))
    # Clamp to zero so a negative num prints no rows (as range(num) did),
    # and slice so a short list cannot raise IndexError.
    for c in clist[:max(num, 0)]:
        print(tplt.format(c[0], c[1], c[2]))
def main():
    """Fetch the Wikipedia revenue ranking page and print its first 10 rows.

    If the download fails, the failure is reported and nothing is parsed.
    """
    cinfo = []
    url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
    html = getHtmlText(url)
    if html == 'Fail to crawl':
        # getHtmlText signals failure with this sentinel; feeding it to the
        # HTML parser would only produce a confusing error further down.
        print('Fail to crawl: ' + url)
        return
    fillCompList(cinfo, html)
    printCompList(cinfo, 10)  # select top 10 records


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
After running the code, we get the following results, which list the top 10 companies ranked by revenue.