In this example, we show how to scrape data from the web with the BeautifulSoup library in Python.
Before building the crawler, we need to install a few related libraries:
pip install requests
pip install BeautifulSoup4
pip install lxml
data:image/s3,"s3://crabby-images/6c1d3/6c1d32540a68ef649aa67abd0d2a65827674c3d9" alt=""
data:image/s3,"s3://crabby-images/f37b4/f37b4818adc05f65059a8df21dfce39fd6e0d215" alt=""
data:image/s3,"s3://crabby-images/527fd/527fdb0641351bb97997bd719541580a7a66a2ba" alt=""
Source Code
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# crawlTable.py
# Crawl data from table in Python
# pip install requests
# pip install BeautifulSoup4
# pip install lxml
import requests
from bs4 import BeautifulSoup
import bs4
# Get text data from url
def getHtmlText(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or the string 'Fail to crawl' when the request
        fails (connection problem, timeout after 30 seconds, or an HTTP
        error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they take the failure path.
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Only request-related failures are converted to the sentinel; a bare
        # ``except`` would also hide KeyboardInterrupt and programming errors.
        return 'Fail to crawl'
# Fill the table
def fillCompList(clist, html):
    """Parse the first table body in *html* and append its rows to *clist*.

    Each appended record is a list ``[rank, name, revenue]`` built from the
    row's ``<th>`` cell and its first and third ``<td>`` cells. The first
    ``<tr>`` of the table body is skipped (presumably the header row).

    Args:
        clist: List that receives the records; it is extended in place.
        html: HTML text to parse.
    """
    soup = BeautifulSoup(html, 'lxml')
    tbody = soup.find('tbody')
    if tbody is None:
        # No table in the document (e.g. the download failed): nothing to add.
        return
    for tr in tbody.find_all('tr')[1:]:
        th = tr.find('th')
        tds = tr.find_all('td')
        # Skip rows that lack the rank cell or the name/revenue cells instead
        # of raising AttributeError/IndexError on irregular rows.
        if th is None or len(tds) < 3:
            continue
        # get_text() also works when the cell wraps its text in nested tags,
        # where ``.string`` would be None.
        clist.append([
            th.get_text().strip(),
            tds[0].get_text().strip(),
            tds[2].get_text().strip(),
        ])
# output data
def printCompList(clist, num):
    """Print a header line followed by at most *num* records from *clist*.

    Args:
        clist: Sequence of records; each record is indexable with the rank,
            name and revenue at positions 0, 1 and 2.
        num: Maximum number of records to print. When *clist* holds fewer
            records, only the available ones are printed; a value of zero
            or less prints the header only.
    """
    # Columns are tab-separated and left-aligned ('<') to widths 10, 20, 20.
    tplt = '{:<10}\t{:<20}\t{:<20}'
    print(tplt.format('Rank', 'Name', 'Revenue(USD millions)'))
    # Slice instead of indexing by range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from selecting records.
    for c in clist[:max(num, 0)]:
        print(tplt.format(c[0], c[1], c[2]))
def main():
    """Fetch the Wikipedia revenue list and print its first 10 records."""
    cinfo = []
    url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
    html = getHtmlText(url)
    if html == 'Fail to crawl':
        # getHtmlText reports failure with this sentinel string; report it
        # instead of handing the sentinel to the HTML parser.
        print('Fail to crawl: {}'.format(url))
        return
    fillCompList(cinfo, html)
    printCompList(cinfo, 10)  # select top 10 records


# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()
After running the code, we get the following results: the top 10 companies ranked by revenue.
data:image/s3,"s3://crabby-images/1fad8/1fad86f4b093c4f6eb373b004e3d5b5f8e3f615b" alt=""