Let's scrape the CEDD awarded-contracts table

❰py❱

packages = [ "numpy", "matplotlib" ] import sys import requests import csv import time from prettytable import PrettyTable from bs4 import BeautifulSoup import pandas as pd ID_BEGIN=12912 ID_END=20000 OUT_FILENAME="rstv.csv" COOKIE="cookiesession1=678B287CE314F41A78856D89EB582510; ASP.NET_SessionId=1t5jnf1bjfatnhwtrqxyuqqb" prefix='cphBody_cphBody_ux' ignoreList = ['cphBody_cphBody_uxTitle'] # Url of website #url="https://rstc.cic.hk/rstc/subcontractorDetail.aspx?id=R000032" url_format="https://www.cedd.gov.hk/tc/tender-notices/contracts/contracts-awarded/index.html" def get_table_headers(table): """Given a table soup, returns all the headers""" headers = [] i = 0 for th in table.find("tr").find_all("th"): if i == 2: headers.append(th.text.strip()) i += 1 return headers def get_table_rows(table): """Given a table, returns all its rows""" rows = [] for tr in table.find_all("tr")[1:]: cells = [] # grab all td tags in this table row tds = tr.find_all("td") #print(f"tds: '{tds}'") if len(tds) == 0: # if no td tags, search for th tags # can be found especially in wikipedia tables below the table ths = tr.find_all("th") for th in ths: cells.append(th.text.strip()) else: # use regular td tags idx = 0 for td in tds: #print(f"idx: '{idx}'") hlink = td.find('a', href=True) if hlink: #print(f"hlink: '{hlink['href']}'") url_sub = "https://www.cedd.gov.hk" + hlink['href'] #print(f"url_sub: '{url_sub}'") sub_rawdata = requests.get(url_sub, timeout=3) sub_html = sub_rawdata.content sub_soup = BeautifulSoup(sub_html, 'html.parser') sub_content = sub_soup.find("div", {"id": "content"}) #print(f"h3_entry: '{sub_content.next_sibling}'") h3 = [] h3_data = [] i = 0 for h3_entry in sub_content.find_all("h3"): if i != 3: #print(f"h3_entry: '{h3_entry.next_sibling}'") h3_data.append(h3_entry.next_sibling) h3.append(h3_entry.text.strip()) i += 1 #print(f"h3: '{h3}'") if idx == 2: #print(f"data: '{td.text.strip()}'") cells.append(td.text.strip()) idx += 1 cells = cells + h3_data #print(f"cells: '{cells}'") rows.append(cells) return rows, h3 def save_as_csv(headers, rows): pd.DataFrame(rows, columns=headers).to_csv(f"cedd.csv") def parse_html(url): print(f"Processing '{url}'...") try: rawdata = requests.get(url, timeout=3) #print(f"html: '{rawdata}'") #rawdata=requests.get(url) html=rawdata.content #print(f"html: '{rawdata.content}'") except: return None, None else: print(f"URL OK") #return None, None # Parsing html content with beautifulsoup soup = BeautifulSoup(html, 'html.parser') table = soup.table #print(f"table: '{table}'") headers = get_table_headers(table) rows, detail_header= get_table_rows(table) headers = headers + detail_header #print(f"headers: '{headers}'") #print(f"rows: '{rows}'") save_as_csv(headers, rows) return None, None if __name__ == '__main__': #for idx in range(ID_BEGIN, ID_END + 1): url = url_format headers, company = parse_html(url) print(f"Done...")