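"""Scrape the CEDD "contracts awarded" index page and the contract detail pages
it links to, then save the collected fields to cedd.csv."""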
import requests
from bs4 import BeautifulSoup
import pandas as pd
# CEDD "contracts awarded" index page (Traditional Chinese version of the site)
URL = "https://www.cedd.gov.hk/tc/tender-notices/contracts/contracts-awarded/index.html"
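# Assumptions about the page layout: the index page holds a single <table> of
# awarded contracts, only the third column of that table is kept, and each row
# links to a detail page whose <h3> headings carry the remaining fields.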
def get_table_headers(table):
    """Given a table soup, return the headers that are kept (third column only)."""
    headers = []
    for i, th in enumerate(table.find("tr").find_all("th")):
        # only the third column of the index table is of interest here
        if i == 2:
            headers.append(th.text.strip())
    return headers
def get_table_rows(table):
    """Given a table, return its data rows plus the headers scraped from the detail pages."""
    rows = []
    detail_headers = []
    for tr in table.find_all("tr")[1:]:
        cells = []
        detail_data = []
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if len(tds) == 0:
            # if no td tags, fall back to th tags (some tables use th for data rows)
            for th in tr.find_all("th"):
                cells.append(th.text.strip())
        else:
            # use regular td tags
            for idx, td in enumerate(tds):
                hlink = td.find('a', href=True)
                if hlink:
                    # follow the link to this contract's detail page
                    url_sub = "https://www.cedd.gov.hk" + hlink['href']
                    sub_rawdata = requests.get(url_sub, timeout=3)
                    sub_soup = BeautifulSoup(sub_rawdata.content, 'html.parser')
                    sub_content = sub_soup.find("div", {"id": "content"})
                    detail_headers = []
                    detail_data = []
                    # every <h3> heading names a field; the node that follows it holds the value
                    for i, h3_entry in enumerate(sub_content.find_all("h3")):
                        # skip the fourth heading so headers and values stay aligned
                        if i != 3:
                            detail_headers.append(h3_entry.text.strip())
                            detail_data.append(h3_entry.next_sibling)
                if idx == 2:
                    # only the third column of the index table is kept
                    cells.append(td.text.strip())
            cells = cells + detail_data
        rows.append(cells)
    return rows, detail_headers
def save_as_csv(headers, rows):
    """Write the scraped rows to cedd.csv in the working directory."""
    pd.DataFrame(rows, columns=headers).to_csv("cedd.csv", index=False)
def parse_html(url):
    """Fetch the index page, scrape its table, and write the result to CSV."""
    print(f"Processing '{url}'...")
    try:
        rawdata = requests.get(url, timeout=3)
        html = rawdata.content
    except requests.RequestException:
        return None, None
    print("URL OK")
    # parse the HTML with BeautifulSoup and work on the first table of the page
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.table
    headers = get_table_headers(table)
    rows, detail_headers = get_table_rows(table)
    headers = headers + detail_headers
    save_as_csv(headers, rows)
    return headers, rows
if __name__ == '__main__':
    headers, rows = parse_html(URL)
    print("Done...")