finchie/census - Change JGCJ2CQPYNRNLDQGVRIDIB5EGKW2O7PCTAIHKZTTJQXE2VTY27QQC

Refactor time-series code

Created by finchie on September 9, 2023

JGCJ2CQPYNRNLDQGVRIDIB5EGKW2O7PCTAIHKZTTJQXE2VTY27QQC

Dependencies

In channels

main

Change contents

Insertion in project.py at line 80 [4.35]
[3.768]
[3.768]
```
    # TODO: assert table is rectangular
```

Insertion in project.py at line 101 [4.35]

[3.1171]

[5.437]


# Filter that decides whether a category heading is usable
def should_keep(category):
    """Return True when `category` contains none of the excluded phrases.

    The check is a plain substring test, so a phrase appearing anywhere in
    the heading is enough to reject it.
    """
    excluded_phrases = (
        "Neg_Ni_inc",  # Negative/nil income
        "more",  # Brackets over a certain threshold (eg $4000more)
        "or more",
        "NS",  # Not stated
        "Notstated",
        "Tot",  # Total
        "PI_S",
    )
    # One hit is enough to throw the category away
    return not any(phrase in category for phrase in excluded_phrases)
# Turn a valid category heading into its year, mean income & mean rent
def split_category(category):
    """Parse a heading like 'C11_1_149_R1_74'.

    The heading is expected to hold five underscore-separated fields:
    year ('C' followed by 2 digits), minimum income, maximum income,
    minimum rent (prefixed with 'R') and maximum rent.

    Returns a tuple `(year, mean_income, mean_rent)` where each mean is the
    midpoint of its bracket, with halves rounded up.

    A heading rejected by `should_keep` is printed and then fails an
    assertion; other malformed headings fail an assertion or raise from
    `int()`.
    """

    def midpoint(low, high):
        # Adding 1 before the floor division rounds a .5 result upwards
        return (low + high + 1) // 2

    # Example category: 'C11_1_149_R1_74'
    fields = category.split("_")
    # Only categories that pass the filter may be split
    is_valid = should_keep(category)
    if not is_valid:
        print(f"Invalid category: {category}")
    assert is_valid is True
    # Year, income bounds and rent bounds make up exactly 5 fields
    assert len(fields) == 5
    # Pull out the plain fields first
    year = fields[0]
    income_low = int(fields[1])
    income_high = int(fields[2])
    rent_high = int(fields[4])
    # Year looks like 'C11': a 'C' and then 2 digits
    assert len(year) == 3
    assert year[0] == "C"
    assert year[1:].isdigit()
    # The lower rent bound carries an 'R' prefix, e.g. 'R1'
    rent_low_field = fields[3]
    assert rent_low_field[0] == "R"
    # With the prefix stripped it converts like the other numbers
    rent_low = int(rent_low_field[1:])
    return (year, midpoint(income_low, income_high), midpoint(rent_low, rent_high))

Replacement in project.py at line 335 [4.35]

∅:D[3.3959] → [4.251:337]

∅:D[5.5784] → [4.251:337]

B:BD[4.251] → [4.251:337]

def merge_files(files):
    # Set up our variables
    categories = []
    years = {}

[3.3959]

[4.337]

class TimeSeriesBase(CensusDataset):
    def __init__(self, dataset):
        """Split `dataset` into its first item and everything after it.

        The first item is stored as `self.categories`; the remaining items
        are stored as `self.data`.
        """
        self.categories, self.data = dataset[0], dataset[1:]

Replacement in project.py at line 340 [4.35]

B:BD[4.338] → [4.338:1186]

    for filename in files:
        file = open(filename, "r").readlines()
        # Split each row on the comma
        file = [[col.strip() for col in row.split(",")] for row in file]
        # First line in file (file[0]) is headings
        first_line = file[0]
        # Don't need the first column, as it says 'STE_CODE_2021'
        # Example value for `categories_for_file`:
        # ['C11_Neg_Ni_inc_R1_74', 'C11_Neg_Ni_inc_R75_99', 'C11_Neg_Ni_inc_R100_149', etc]
        categories_for_file = first_line[1:]
        # When we need to use categories, use `categories_for_file`
        # The `file` variable should be JUST data, so remove the first line
        # Example value:
        """
        [
            ['1', '400', '505', etc],
            ['2', '241', '297', etc],
            etc
        ]
        """
        file = file[1:]

[4.338]

[4.1186]

    def median_incomes(self):
        incomes = {}
        for column in range(len(self.categories)):
            category = self.categories[column]

Replacement in project.py at line 345 [4.35]

B:BD[4.1187] → [4.1187:1588]

        # Get a list of all the areas (states, could later be LGAs)
        # Should end up looking like:
        # ['1', '2', '3', etc]
        areas = []
        for row in range(len(file)):
            # Area code is the first column of each row
            # Example value: '1'
            first_cell = file[row][0]
            # Add that to list of all areas
            areas.append(first_cell)

[4.1187]

[4.1588]

            total_income = 0
            for row in range(len(self.data)):
                total_income += int(self.data[row][column])

Replacement in project.py at line 349 [4.35]

B:BD[4.1589] → [4.1589:1673]

            # Remove this column from the row
            file[row] = file[row][1:]

[4.1589]

[4.1673]

            income = category[1]
            if income not in incomes:
                incomes[income] = 0
            incomes[income] += total_income

Replacement in project.py at line 354 [4.35]

B:BD[4.1674] → [4.1674:1898]

        # Go through ALL the data in the file column-by-column
        for column in range(len(categories_for_file)):
            # Example category: 'C11_Neg_Ni_inc_R1_74'
            category = categories_for_file[column]

[4.1674]

[4.1898]

        return incomes

Replacement in project.py at line 356 [4.35]

B:BD[4.1899] → [4.1899:2466]

            # Example value: 'C11_Neg_Ni_inc_R1_74'
            # Split the category up into parts (separated by underscores)
            category = category.split("_")
            # Example value for category: ['C11', 'Neg', 'Ni', etc]
            # Year is always prefixed, so get everything before first underscore
            # Example value: 'C11' (first item in category)
            year = category[0]
            # Category is everything after the first underscore
            # Example value: 'Neg_Ni_inc_R1_74'
            category = "_".join(category[1:])

[4.1899]

[4.2466]

    def median_rents(self):
        rents = {}
        for column in range(len(self.categories)):
            category = self.categories[column]

Replacement in project.py at line 361 [4.35]

B:BD[4.2467] → [4.2467:2717]

            # Add that category to the list of categories (without year)
            # Example value: 'Neg_Ni_inc_R1_74' (no year included!!)
            # TODO: assert all categories are the same across years
            categories.append(category)

[4.2467]

[4.2717]

            total_rent = 0
            for row in range(len(self.data)):
                total_rent += int(self.data[row][column])

Replacement in project.py at line 365 [4.35]

B:BD[4.2718] → [4.2718:3019]

            # Set up the year if none exists
            # Example year: 'C11'
            if year not in years:
                years[year] = {}
                # Should look like:
                """
                    {
                        'C11': {},
                    }
                """

[4.2718]

[4.3019]

            rent = category[0]
            if rent not in rents:
                rents[rent] = 0
            rents[rent] += total_rent

Replacement in project.py at line 370 [4.35]

B:BD[4.3020] → [4.3020:3579]

            # Now add every row in this column
            for row in range(len(file)):
                # years[year] is a dict mapping states to rows
                # Every key is essentially one row of data in the csv
                # It will end up looking something like:
                """
                {
                    '1': [1, 2, 3, etc],
                    '2': [3, 1, 2, etc],
                    etc
                }
                """
                # Example value: first row (row = 0), area = '1'
                area = areas[row]

[4.3020]

[4.3579]

        return rents

Deletion in project.py at line 372 [4.35]

B:BD[4.3580] → [4.3580:3761]

                # If the area hasn't been seen before
                if area not in years[year]:
                    # Add an empty list
                    years[year][area] = []

Replacement in project.py at line 373 [4.35]

B:BD[4.3762] → [4.3762:3998]

                # This is the very big long row for the current year
                current_cell = file[row][column]
                # Add the current cell to the relevant dataset
                years[year][area].append(current_cell)

[4.3762]

[4.3998]

class Census2011(TimeSeriesBase):
    """Time-series dataset whose `year()` reports the 2011 census."""
    def year(self):
        # Census year this dataset represents
        return 2011

Deletion in project.py at line 377 [4.35]
B:BD[4.3999] → [4.3999:4030]
```
    return (categories, years)
```

Insertion in project.py at line 378 [4.35]

[4.4031]

class Census2016(TimeSeriesBase):
    """Time-series dataset whose `year()` reports the 2016 census."""
    def year(self):
        # Census year this dataset represents
        return 2016

Deletion in project.py at line 382 [4.35]

B:BD[4.4032] → [4.4032:4189]

B:BD[4.4189] → [5.5785:5960]

∅:D[5.5960] → [4.4269:4861]

B:BD[4.4269] → [4.4269:4861]

# Filter that decides whether a category heading is usable
def should_keep(category):
    """Return True only if no excluded phrase occurs inside `category`."""
    # Headings containing any of these substrings are left out
    rejected = (
        "Neg_Ni_inc",
        "4000more",
        "R650more",
        "R_NS",
        "R_Tot",
        "PI_S",
        "or more",
        "Notstated",
    )
    # Every phrase has to be absent for the category to survive
    return all(phrase not in category for phrase in rejected)
# Take a valid category and return the mean income & rent
def split_category(category):
    # Example category: '1_149_R1_74'
    parts = category.split("_")
    # Make sure it's a valid category
    if not should_keep(category):
        print(category)
    assert should_keep(category) is True
    # All valid categories have 4 parts
    assert len(parts) == 4

Replacement in project.py at line 383 [4.35]

B:BD[4.4862] → [4.4862:4978]

    # Split up the parts
    min_income = int(parts[0])
    max_income = int(parts[1])
    max_rent = int(parts[3])

[4.4862]

[4.4978]

class Census2021(TimeSeriesBase):
    """Time-series dataset whose `year()` reports the 2021 census."""
    def year(self):
        # Census year this dataset represents
        return 2021

Deletion in project.py at line 387 [4.35]

B:BD[4.4979] → [4.4979:5232]

    # `min_rent` is special because it has the 'R' prefix
    min_rent = parts[2]
    # Make sure the first character is an 'R'
    assert min_rent[0] == "R"
    # Then we can just convert it into an int like the others
    min_rent = int(min_rent[1:])

Replacement in project.py at line 388 [4.35]

B:BD[4.5233] → [4.5233:6861]

    mean_income = round((min_income + max_income) / 2)
    mean_rent = round((min_rent + max_rent) / 2)
    return (mean_income, mean_rent)
# Second step: split 2-variable categories into table
def reshape_into_table(categories, years):
    # Example value: 'C11'
    for year in years:
        # Example value: '1'
        for area in years[year]:
            # TODO: categories may change between iterations
            # This should look like:
            """
            [
                # This is income of $1-$149
                [
                    # This is rent of $1-$74
                    1535,
                    # This is rent of $75-$99
                    1580,
                    etc
                ],
                # This is income of $150-$299
                [
                    # This is rent of $1-$74
                    4591,
                    # This is rent of $75-$99
                    2188,
                    etc
                ],
                etc
            ]
            """
            new_data = []
            # The last mean income we saw
            last_income = None
            # Make sure there are as many categories as data
            assert len(years[year][area]) == len(categories)
            # Go through each 2-variable category and data pair
            for column in range(len(categories)):
                # Get the relevant category
                category = categories[column]
                # Check if the category is part of our dataset
                if should_keep(category) is True:
                    income, rent = split_category(category)

[4.5233]

[4.6861]

class Census2011_2016_2021(CensusDataset):
    def __init__(self, files):
        census2011 = None
        census2016 = None
        census2021 = None

Replacement in project.py at line 394 [4.35]

B:BD[4.6862] → [4.6862:7059]

                    # Check if we've hit a new income bracket
                    if last_income != income:
                        new_data.append([])
                        last_income = income

[4.6862]

[4.7059]

        for file_index in range(len(files)):
            file = open(files[file_index], "r").read()
            file = tabulate(file)

Replacement in project.py at line 398 [4.35]

B:BD[4.7060] → [4.7060:7382]

                    # Append our rent information to the most recent bracket
                    # The most recent bracket will always be the one we want
                    current_cell = years[year][area][column]
                    current_cell = int(current_cell)
                    new_data[-1].append(current_cell)

[4.7060]

[4.7382]

            # First-time setup requires opening the first file
            if file_index == 0:
                census2011 = [[] for row in range(len(file))]
                census2016 = [[] for row in range(len(file))]
                census2021 = [[] for row in range(len(file))]

Replacement in project.py at line 404 [4.35]

B:BD[4.7383] → [4.7383:7469]

            # Update this from old->new data
            years[year][area] = new_data

[4.7383]

[4.7469]

            # Headings are the first row
            headings = file[0]
            # Make sure to skip the first column (STE_CODE_2021)
            for column in range(1, len(file[0])):
                if should_keep(headings[column]):
                    (year, mean_income, mean_rent) = split_category(headings[column])

Replacement in project.py at line 411 [4.35]

B:BD[4.7470] → [4.7470:7798]

    incomes = []
    rents = []
    for category in categories:
        if should_keep(category):
            (mean_income, mean_rent) = split_category(category)
            if mean_income not in incomes:
                incomes.append(mean_income)
            if mean_rent not in rents:
                rents.append(mean_rent)

[4.7470]

[4.7798]

                    # Add each cell to its corresponding row
                    for row in range(len(file)):
                        if row == 0:
                            cell = (mean_income, mean_rent)
                        else:
                            cell = file[row][column]

Replacement in project.py at line 418 [4.35]

B:BD[4.7799] → [4.7799:7836]

    return ((incomes, rents), years)

[4.7799]

[4.7836]

                        if year == "C11":
                            census2011[row].append(cell)
                        elif year == "C16":
                            census2016[row].append(cell)
                        elif year == "C21":
                            census2021[row].append(cell)
                        else:
                            raise ValueError("Unsupported census year")

Insertion in project.py at line 427 [4.35]

[4.7837]

        # Make sure headings are consistent across years
        assert census2011[0] == census2016[0] == census2021[0]

Replacement in project.py at line 430 [4.35]

B:BD[4.7838] → [6.314:429]

∅:D[6.429] → [7.989:1009]

B:BD[4.7966] → [7.989:1009]

# (categories, years) = merge_files([TIME_SERIES_A])
# (categories, years) = reshape_into_table(categories, years)
# print(categories)

[4.7838]

[7.1009]

        self.census2011 = Census2011(census2011)
        self.census2016 = Census2016(census2016)
        self.census2021 = Census2021(census2021)

Replacement in project.py at line 434 [4.35]

B:BD[7.1010] → [5.5961:6014]

# parts = merge_time_series_files(STATE_TIME_SERIES)

[7.1010]

[2.0]

    def get_year(self, year):
        """Return the stored dataset for the census held in `year`.

        Supported values are 2011, 2016 and 2021, which map to
        `self.census2011`, `self.census2016` and `self.census2021`.

        Raises:
            ValueError: if `year` is not one of the supported values.
        """
        if year == 2011:
            return self.census2011
        if year == 2016:
            return self.census2016
        if year == 2021:
            return self.census2021
        # Nothing matched, so the caller asked for a census we don't hold
        raise ValueError("Unsupported year")

Replacement in project.py at line 503 [4.35]

∅:D[2.1842] → [8.1945:2089]

B:BD[3.3983] → [8.1945:2089]

print(Census1996().median_incomes())
print(Census1996().median_rents())
print(Census2001().median_incomes())
print(Census2001().median_rents())

[2.1842]

[2.1843]

# print(Census1996().median_incomes())
# print(Census1996().median_rents())
# print(Census2001().median_incomes())
# print(Census2001().median_rents())

Replacement in project.py at line 508 [4.35]

B:BD[2.1844] → [2.1844:1936]

census = Census2001().median_incomes()
for key in census:
    print(f"{key},{census[key]}")

[2.1844]

[2.1936]

# census = Census2001().median_incomes()
# for key in census:
#     print(f"{key},{census[key]}")
# print(find_median(census))

Replacement in project.py at line 514 [4.35]

B:BD[2.1937] → [2.1937:1964]

print(find_median(census))

[2.1937]

[4.7984]

# print(merge_files(STATE_TIME_SERIES)[0])
# Build the combined 2011/2016/2021 dataset from STATE_TIME_SERIES (defined
# outside this view), then print what median_rents() returns for 2011
time_series = Census2011_2016_2021(STATE_TIME_SERIES)
print(time_series.get_year(2011).median_rents())