Let's plot random numbers
    
    
      packages = [
       "numpy",
       "matplotlib"
      ]
    
    
import sys
import requests
import csv
import time
from prettytable import PrettyTable
from bs4 import BeautifulSoup
import pandas as pd
# Inclusive range of numeric record ids.
# NOTE(review): in the visible code these are only mentioned by a
# commented-out loop in the script entry point -- confirm they are still needed.
ID_BEGIN = 12912
ID_END = 20000

# NOTE(review): not referenced by the visible code; the CSV writer below uses
# its own file name -- confirm before relying on this value.
OUT_FILENAME = "rstv.csv"

# NOTE(review): hard-coded session cookie; no visible request sends it.
# Consider removing it or loading it from the environment.
COOKIE = (
    "cookiesession1=678B287CE314F41A78856D89EB582510; "
    "ASP.NET_SessionId=1t5jnf1bjfatnhwtrqxyuqqb"
)

# Element-id prefix and the ids to skip.
# NOTE(review): neither name is used by the visible code.
prefix = "cphBody_cphBody_ux"
ignoreList = ["cphBody_cphBody_uxTitle"]

# Page that is scraped when the script runs.
# Earlier target, kept for reference:
#   https://rstc.cic.hk/rstc/subcontractorDetail.aspx?id=R000032
url_format = "https://www.cedd.gov.hk/tc/tender-notices/contracts/contracts-awarded/index.html"
def get_table_headers(table):
    """Return the header text of the third column of the table's first row.

    Only the ``<th>`` at zero-based position 2 of the first ``<tr>`` is kept;
    all other header cells are ignored.

    Args:
        table: A parsed table element exposing ``find`` / ``find_all``.

    Returns:
        A list holding the stripped text of that single header cell, or an
        empty list when the first row has fewer than three ``<th>`` cells.
    """
    first_row = table.find("tr")
    return [
        cell.text.strip()
        for position, cell in enumerate(first_row.find_all("th"))
        if position == 2
    ]
def _fetch_contract_details(detail_url):
    """Download a linked detail page and extract its ``<h3>`` sections.

    Args:
        detail_url: Absolute URL of the detail page.

    Returns:
        A ``(titles, values)`` pair of equally long lists: the stripped text
        of each ``<h3>`` inside ``div#content`` and the node that directly
        follows it. The fourth ``<h3>`` (index 3) is skipped. Both lists are
        empty when the page has no ``div#content``.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    response = requests.get(detail_url, timeout=3)
    page = BeautifulSoup(response.content, 'html.parser')
    content = page.find("div", {"id": "content"})
    if content is None:
        # Unexpected page layout: report "no details" instead of crashing.
        return [], []

    titles = []
    values = []
    for position, heading in enumerate(content.find_all("h3")):
        if position == 3:
            continue
        values.append(heading.next_sibling)
        titles.append(heading.text.strip())
    return titles, values


def get_table_rows(table):
    """Extract the data rows of *table*, enriched with linked detail pages.

    The first ``<tr>`` is treated as the header row and skipped. For every
    other row:

    * a row without ``<td>`` cells contributes the stripped text of all its
      ``<th>`` cells;
    * otherwise only the text of the third ``<td>`` (index 2) is kept, followed
      by the detail values fetched from the page linked in the row. When a row
      contains several links, the last one wins.

    Rows without a link get no detail values (they no longer inherit the
    values of a previous row, and no longer fail when no link was seen yet).

    Args:
        table: A parsed table element exposing ``find_all``.

    Returns:
        A ``(rows, detail_headers)`` tuple. ``rows`` is a list of cell lists;
        ``detail_headers`` holds the ``<h3>`` titles of the most recent detail
        page that yielded any, or an empty list when none did.

    Raises:
        requests.RequestException: If a linked detail page cannot be fetched.
    """
    from urllib.parse import urljoin

    rows = []
    detail_headers = []
    for tr in table.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if not tds:
            # Rows made only of <th> cells (e.g. footers or sub-headings).
            rows.append([th.text.strip() for th in tr.find_all("th")])
            continue

        cells = []
        row_details = []  # reset per row so stale data never leaks across rows
        for idx, td in enumerate(tds):
            hlink = td.find('a', href=True)
            if hlink:
                # urljoin copes with absolute hrefs and hrefs lacking a
                # leading slash; root-relative hrefs resolve as before.
                url_sub = urljoin("https://www.cedd.gov.hk", hlink['href'])
                titles, row_details = _fetch_contract_details(url_sub)
                if titles:
                    detail_headers = titles
            if idx == 2:
                cells.append(td.text.strip())
        rows.append(cells + row_details)
    return rows, detail_headers
def save_as_csv(headers, rows, filename="cedd.csv"):
    """Write *rows* to a CSV file using *headers* as the column names.

    The file is produced via a pandas ``DataFrame``, so the first column of
    the output is the DataFrame's integer index.

    Args:
        headers: Column names, one per cell in each row.
        rows: Sequence of rows, each a sequence of cell values.
        filename: Destination path; defaults to ``"cedd.csv"`` in the current
            working directory, the path this function always used before.
    """
    pd.DataFrame(rows, columns=headers).to_csv(filename)
def parse_html(url):
    """Download *url*, scrape its first table and save the result as CSV.

    The first ``<table>`` of the page is passed to ``get_table_headers`` and
    ``get_table_rows``; the combined headers and the rows are then handed to
    ``save_as_csv``.

    Args:
        url: Address of the page to scrape.

    Returns:
        Always ``(None, None)``. The scraped data is only written to disk;
        the two-element shape is kept so existing callers can keep unpacking
        the result.
    """
    print(f"Processing '{url}'...")
    try:
        rawdata = requests.get(url, timeout=3)
        # Treat HTTP error statuses (404, 500, ...) as failures instead of
        # trying to parse an error page.
        rawdata.raise_for_status()
    except requests.RequestException as err:
        # Only network/HTTP problems are handled here; anything else
        # (including Ctrl-C) propagates to the caller.
        print(f"Request failed: {err}")
        return None, None

    print("URL OK")
    # Parse the html content with BeautifulSoup
    soup = BeautifulSoup(rawdata.content, 'html.parser')
    table = soup.table
    if table is None:
        print("No table found")
        return None, None

    headers = get_table_headers(table)
    rows, detail_header = get_table_rows(table)
    save_as_csv(headers + detail_header, rows)
    return None, None
if __name__ == "__main__":
    # Script entry point: scrape the single configured page.
    # An earlier variant iterated over range(ID_BEGIN, ID_END + 1) here.
    url = url_format
    headers, company = parse_html(url)
    print("Done...")