# Simple function that says whether or not to use a category
def should_keep(category: str) -> bool:
    """Return True when `category` contains none of the excluded phrases.

    The check is a plain, case-sensitive substring test against every entry
    in `bad_words`, so a phrase matches wherever it appears in the heading.

    Args:
        category: A column heading, e.g. 'C11_1_149_R1_74'.

    Returns:
        False if any excluded phrase occurs in `category`, otherwise True.
    """
    # If any of these phrases are in the category, it's not allowed!
    bad_words = (
        "Neg_Ni_inc",  # Negative/nil income
        "more",  # Brackets over a certain threshold (eg $4000more)
        "or more",  # Already matched by "more"; kept for explicitness
        "NS",  # Not stated
        "Notstated",
        "Tot",  # Total
        "PI_S",
    )
    return not any(bad_word in category for bad_word in bad_words)


# Take a valid category and return the year, mean income & rent
def split_category(category: str):
    """Split a heading into `(year, mean_income, mean_rent)`.

    A valid heading has five underscore-separated parts:
    year, minimum income, maximum income, 'R' + minimum rent, maximum rent.
    Example: 'C11_1_149_R1_74' -> ('C11', 75, 38).

    Args:
        category: A heading for which `should_keep(category)` is True.

    Returns:
        A tuple of the year string (e.g. 'C11'), the midpoint of the income
        bracket and the midpoint of the rent bracket. Both midpoints are
        integers, rounded up when the bracket bounds sum to an odd number.

    Raises:
        AssertionError: If the heading is excluded by `should_keep`, does not
            have exactly five parts, has a year that is not 'C' followed by
            two digits, or has a minimum-rent part not starting with 'R'.
        ValueError: If any bracket bound is not an integer.
    """
    # Validation uses explicit raises rather than `assert` statements so the
    # checks still run when Python is started with -O. AssertionError is kept
    # as the exception type so existing callers see the same failure.

    # Example category: 'C11_1_149_R1_74'
    parts = category.split("_")

    # Make sure it's a valid category
    if not should_keep(category):
        raise AssertionError(f"Invalid category: {category}")

    # All valid categories have 5 parts
    if len(parts) != 5:
        raise AssertionError(
            f"Expected 5 parts in category {category!r}, got {len(parts)}"
        )

    # Split up the parts
    year = parts[0]
    min_income = int(parts[1])
    max_income = int(parts[2])
    max_rent = int(parts[4])

    # Year starts with 'C' followed by 2 digits
    if not (len(year) == 3 and year[0] == "C" and year[1:].isdigit()):
        raise AssertionError(f"Invalid year {year!r} in category {category!r}")

    # `min_rent` is special because it has the 'R' prefix
    min_rent = parts[3]
    # Make sure the first character is an 'R' (startswith also copes with an
    # empty part instead of failing with an IndexError)
    if not min_rent.startswith("R"):
        raise AssertionError(
            f"Rent bracket {min_rent!r} in category {category!r} "
            "does not start with 'R'"
        )
    # Then we can just convert it into an int like the others
    min_rent = int(min_rent[1:])

    # Adding 1 before the floor division rounds a half up
    mean_income = (min_income + max_income + 1) // 2
    mean_rent = (min_rent + max_rent + 1) // 2
    return (year, mean_income, mean_rent)
for filename in files:file = open(filename, "r").readlines()# Split each row on the commafile = [[col.strip() for col in row.split(",")] for row in file]# First line in file (file[0]) is headingsfirst_line = file[0]# Don't need the first column, as it says 'STE_CODE_2021'# Example value for `categories_for_file`:# ['C11_Neg_Ni_inc_R1_74', 'C11_Neg_Ni_inc_R75_99', 'C11_Neg_Ni_inc_R100_149', etc]categories_for_file = first_line[1:]# When we need to use categories, use `categories_for_file`# The `file` variable should be JUST data, so remove the first line# Example value:"""[['1', '400', '505', etc],['2', '241', '297', etc],etc]"""file = file[1:]
def median_incomes(self):incomes = {}for column in range(len(self.categories)):category = self.categories[column]
# Get a list of all the areas (states, could later be LGAs)# Should end up looking like:# ['1', '2', '3', etc]areas = []for row in range(len(file)):# Area code is the first column of each row# Example value: '1'first_cell = file[row][0]# Add that to list of all areasareas.append(first_cell)
total_income = 0for row in range(len(self.data)):total_income += int(self.data[row][column])
# Example value: 'C11_Neg_Ni_inc_R1_74'# Split the category up into parts (separated by underscores)category = category.split("_")# Example value for category: ['C11', 'Neg', 'Ni', etc]# Year is always prefixed, so get everything before first underscore# Example value: 'C11' (first item in category)year = category[0]# Category is everything after the first underscore# Example value: 'Neg_Ni_inc_R1_74'category = "_".join(category[1:])
def median_rents(self):rents = {}for column in range(len(self.categories)):category = self.categories[column]
# Add that category to the list of categories (without year)# Example value: 'Neg_Ni_inc_R1_74' (no year included!!)# TODO: assert all categories are the same across yearscategories.append(category)
total_rent = 0for row in range(len(self.data)):total_rent += int(self.data[row][column])
# Now add every row in this columnfor row in range(len(file)):# years[year] is a dict mapping states to rows# Every key is essentially one row of data in the csv# It will end up looking something like:"""{'1': [1, 2, 3, etc],'2': [3, 1, 2, etc],etc}"""# Example value: first row (row = 0), area = '1'area = areas[row]
return rents
# Simple function that says whether or not to use a categorydef should_keep(category):# If any of these phrases are in the category, it's not allowed!bad_words = ["Neg_Ni_inc","4000more","R650more","R_NS","R_Tot","PI_S","or more","Notstated",]for bad_word in bad_words:if bad_word in category:# Bad word! Tell them not to use this categoryreturn False# Made it to the end without any bad words, should be finereturn True# Take a valid category and return the mean income & rentdef split_category(category):# Example category: '1_149_R1_74'parts = category.split("_")# Make sure it's a valid categoryif not should_keep(category):print(category)assert should_keep(category) is True# All valid categories have 4 partsassert len(parts) == 4
mean_income = round((min_income + max_income) / 2)mean_rent = round((min_rent + max_rent) / 2)return (mean_income, mean_rent)# Second step: split 2-variable categories into tabledef reshape_into_table(categories, years):# Example value: 'C11'for year in years:# Example value: '1'for area in years[year]:# TODO: categories may change between iterations# This should look like:"""[# This is income of $1-$149[# This is rent of $1-$741535,# This is rent of $75-$991580,etc],# This is income of $150-$299[# This is rent of $1-$744591,# This is rent of $75-$992188,etc],etc]"""new_data = []# The last mean income we sawlast_income = None# Make sure there are as many categories as dataassert len(years[year][area]) == len(categories)# Go through each 2-variable category and data pairfor column in range(len(categories)):# Get the relevant categorycategory = categories[column]# Check if the category is part of our datasetif should_keep(category) is True:income, rent = split_category(category)
class Census2011_2016_2021(CensusDataset):def __init__(self, files):census2011 = Nonecensus2016 = Nonecensus2021 = None
# Append our rent information to the most recent bracket# The most recent bracket will always be the one we wantcurrent_cell = years[year][area][column]current_cell = int(current_cell)new_data[-1].append(current_cell)
# First-time setup requires opening the first fileif file_index == 0:census2011 = [[] for row in range(len(file))]census2016 = [[] for row in range(len(file))]census2021 = [[] for row in range(len(file))]
# Update this from old->new datayears[year][area] = new_data
# Headings are the first rowheadings = file[0]# Make sure to skip the first column (STE_CODE_2021)for column in range(1, len(file[0])):if should_keep(headings[column]):(year, mean_income, mean_rent) = split_category(headings[column])
incomes = []rents = []for category in categories:if should_keep(category):(mean_income, mean_rent) = split_category(category)if mean_income not in incomes:incomes.append(mean_income)if mean_rent not in rents:rents.append(mean_rent)
# Add each cell to its corresponding rowfor row in range(len(file)):if row == 0:cell = (mean_income, mean_rent)else:cell = file[row][column]
# (categories, years) = merge_files([TIME_SERIES_A])# (categories, years) = reshape_into_table(categories, years)# print(categories)
self.census2011 = Census2011(census2011)self.census2016 = Census2016(census2016)self.census2021 = Census2021(census2021)
# For each dataset class, build a fresh instance per call and print what
# median_incomes() and then median_rents() return.
for census_cls in (Census1996, Census2001):
    print(census_cls().median_incomes())
    print(census_cls().median_rents())
# print(Census1996().median_incomes())# print(Census1996().median_rents())# print(Census2001().median_incomes())# print(Census2001().median_rents())