This is the first dataset that uses classes, and all the others will probably migrate over.
# NOTE(review): the token below preceded the code in the original text;
# presumably a version-control change identifier -- confirm before removing.
# QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC


def remove_quoted_commas(line):
    """Return *line* without double quotes and without commas that sit inside them.

    Every '"' character toggles the "inside a quote" state and is itself
    dropped. Commas are kept only while outside a quote; every other
    character is kept unchanged. For example '"$1,000",5' becomes '$1000,5'.

    Raises:
        AssertionError: if *line* ends while a quote is still open.
    """
    inside_quote = False
    kept = []
    for character in line:
        if character == '"':
            inside_quote = not inside_quote
        elif character != "," or not inside_quote:
            kept.append(character)

    # Make sure no quotes are unclosed
    assert not inside_quote
    return "".join(kept)


# Base dataset functionality, can be used to see broad historical trends from 1996-2021
class CensusDataset:
    """Interface shared by the census datasets; every method must be overridden."""

    def census_year(self):
        """Return the year of the census; not implemented in the base class."""
        raise NotImplementedError

    def median_incomes(self):
        """Return the per-bracket median incomes; not implemented in the base class."""
        raise NotImplementedError

    def median_rents(self):
        """Return the per-bracket median rents; not implemented in the base class."""
        raise NotImplementedError


class Census1996(CensusDataset):
    """Parser for the 1996 census export (comma-separated text split into sections)."""

    # Each section of the file is separated by this string
    SECTION_SEPARATOR = ",,,,,,,,,,,,,"

    def __init__(self, filename):
        """Load *filename*, pick out the rent heading and income table, and parse them.

        The non-empty sections are stored in ``self.sections``; section 4 is
        kept as ``self.WEEKLY_RENT_HEADING`` and section 5 as
        ``self.POPULATION_BY_INCOME``. The parsed population table is
        pretty-printed to stdout.

        Raises:
            IndexError: if the file has fewer than six non-empty sections.
            AssertionError: if a section does not have the expected layout.
        """
        self.sections = self._load_sections(filename)

        self.WEEKLY_RENT_HEADING = self.sections[4]
        self.POPULATION_BY_INCOME = self.sections[5]

        # Parse the rent and income brackets. The results are not stored, but
        # the calls run the layout assertions against the loaded file.
        self.median_rents()
        self.median_incomes()

        # Extract the actual table
        population_data = self.population_data()

        # NOTE(review): printing from the constructor looks like leftover
        # debugging output; kept so existing behaviour is unchanged.
        from pprint import pprint
        pprint(population_data)

    def _load_sections(self, filename):
        """Read *filename* and return its non-empty, whitespace-stripped sections."""
        # The context manager closes the file even if reading fails.
        with open(filename, "r") as handle:
            text = handle.read()

        stripped = (section.strip() for section in text.split(self.SECTION_SEPARATOR))
        # Some pieces are just section breaks, so ignore those
        return [section for section in stripped if section != ""]

    def census_year(self):
        """Return the census year, 1996."""
        return 1996

    def filter_rent_brackets(self):
        """Return ``(start_ranges, end_ranges)`` for the usable weekly-rent columns.

        The heading spans three lines: a title line, a line holding the start
        of each range (eg '$0-') and a line holding the end of each range
        (eg '$99'). The last three columns ('$1000 or more', 'Not stated',
        'Total') are removed.

        Raises:
            AssertionError: if the heading does not have that layout.
        """
        # Remove commas inside of quotes as to not break parsing
        weekly_rent_heading = remove_quoted_commas(self.WEEKLY_RENT_HEADING)
        assert weekly_rent_heading.startswith(",,,,,, Weekly rent,,,,,,,")
        assert weekly_rent_heading.count("\n") == 2

        # The headings are split between 2 lines (using example $0-$99):
        # 1st line: start of range (eg $0-)
        # 2nd line: end of range (eg $99)
        [start_line, end_line] = weekly_rent_heading.splitlines()[1:]

        # Remove comma prefix and split into columns
        start_ranges = start_line[1:].split(",")
        end_ranges = end_line[1:].split(",")

        # Remove the last 3 columns:
        # 1. $1000 or more
        # 2. Not stated
        # 3. Total
        assert start_ranges[-3:] == ["$1000", "Not", ""]
        assert end_ranges[-3:] == ["or more", "stated", "Total"]
        return (start_ranges[:-3], end_ranges[:-3])

    def parse_rent_bracket(self, start, end):
        """Convert a heading pair such as ('$0-', '$99') into the ints (0, 99).

        Raises:
            AssertionError: if the '$' prefixes or '-' suffix are missing, or
                if ``should_keep`` rejects the bracket.
            ValueError: if the remaining text is not an integer.
        """
        assert start.startswith("$")
        assert start.endswith("-")
        assert end.startswith("$")

        # Strip '$' prefix and '-' suffix
        start = start[1:-1]
        # Strip '$' prefix
        end = end[1:]

        # Make sure we have filtered out bad categories such as 'Not stated'
        # NOTE(review): should_keep is not defined in this part of the file;
        # presumably it is provided elsewhere in the module -- confirm.
        assert should_keep(start + end)

        return (int(start), int(end))

    def median_rents(self):
        """Return the midpoint of every weekly-rent bracket (eg 50 for $0-$99)."""
        (start_ranges, end_ranges) = self.filter_rent_brackets()

        rents = []
        for column, start_text in enumerate(start_ranges):
            # Indexing (rather than zip) keeps the IndexError when the two
            # heading lines do not have the same number of columns.
            (start, end) = self.parse_rent_bracket(start_text, end_ranges[column])
            rents.append((start + end + 1) // 2)
        return rents

    def filter_income_brackets(self):
        """Return the income-table rows that describe a bounded income bracket.

        The first two rows (negative and nil income) and the last three rows
        ('$2,000 or more', partially stated, not stated) are removed.

        Raises:
            AssertionError: if those rows are not where they are expected.
        """
        rows = self.POPULATION_BY_INCOME.splitlines()

        # Remove rows containing irrelevant data
        assert rows[0].startswith("Negative income")
        assert rows[1].startswith("Nil income")
        assert rows[-3].startswith('"$2,000 or more"')
        assert rows[-2].startswith("Partial income stated(a)")
        assert rows[-1].startswith("All incomes not stated(b)")
        return rows[2:-3]

    def extract_income_columns(self, line):
        """Split one income-table row into its columns."""
        # Convert '"$1,000"' -> '$1000' so quoted amounts stay in one column
        return remove_quoted_commas(line).split(",")

    def median_incomes(self):
        """Return the midpoint of every bounded income bracket.

        Raises:
            AssertionError: if a bracket is malformed or rejected by
                ``should_keep``.
            ValueError: if a bracket does not split into exactly two integer
                amounts.
        """
        incomes = []
        for row in self.filter_income_brackets():
            bracket = self.extract_income_columns(row)[0]

            # Two dollar amounts separated by a hyphen ('-'),
            # with values such as: '$1-$39'
            [start, end] = bracket.split("-")

            # Make sure we've got data in the right shape
            assert start.startswith("$")
            assert end.startswith("$")
            # NOTE(review): should_keep is not defined in this part of the
            # file; presumably it is provided elsewhere in the module -- confirm.
            assert should_keep(bracket)

            # Strip the '$' prefixes, downcast and calculate midpoint
            low = int(start[1:])
            high = int(end[1:])
            incomes.append((low + high + 1) // 2)
        return incomes

    def population_data(self):
        """Return the population counts of every bounded income bracket.

        One inner list per income bracket, holding every column after the
        bracket label converted to ``int``.

        Raises:
            ValueError: if a column is not an integer.
        """
        # We don't care about income brackets, only population data
        return [
            [int(column) for column in self.extract_income_columns(row)[1:]]
            for row in self.filter_income_brackets()
        ]