Skip to the content.

This article is the third part of dataset construction. Here I want to add new data into dataset I got on previous steps. This time it will be info students who wasn’t able to make it on time and stayed in primary school longer then others

All the data for this part of dataset construction I got here: https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/bo-sbo/zittenblijven.jsp

3. Students who stayed in primary school longer.

Read .csv file of new data:

import pandas as pd 
data_raw = pd.read_csv("input_data/Students_stayed_longer_2013_2019.csv", error_bad_lines=False, sep=';', encoding = "ISO-8859-1") 

print(data_raw.shape)
print(data_raw.columns)
data_raw.head() 
(6910, 39)
Index(['BRIN_NUMMER', 'VESTIGINGSNUMMER', 'INSTELLINGSNAAM_VESTIGING',
       'POSTCODE_VESTIGING', 'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM',
       'PROVINCIE', 'SOORT_PO', 'DENOMINATIE_VESTIGING',
       'BEVOEGD_GEZAG_NUMMER', 'LJ8_13', 'ZIT_13', 'ZITPERC_13',
       'ZITPERC_LJ_13', 'LJ8_14', 'ZIT_14', 'ZITPERC_14', 'ZITPERC_LJ_14',
       'LJ8_15', 'ZIT_15', 'ZITPERC_15', 'ZITPERC_LJ_15', 'LJ8_16', 'ZIT_16',
       'ZITPERC_16', 'ZITPERC_LJ_16', 'LJ8_17', 'ZIT_17', 'ZITPERC_17',
       'ZITPERC_LJ_17', 'LJ8_18', 'ZIT_18', 'ZITPERC_18', 'ZITPERC_LJ_18',
       'LJ8_19', 'ZIT_19', 'ZITPERC_19', 'ZITPERC_LJ_19'],
      dtype='object')
BRIN_NUMMER VESTIGINGSNUMMER INSTELLINGSNAAM_VESTIGING POSTCODE_VESTIGING PLAATSNAAM GEMEENTENUMMER GEMEENTENAAM PROVINCIE SOORT_PO DENOMINATIE_VESTIGING ... ZITPERC_17 ZITPERC_LJ_17 LJ8_18 ZIT_18 ZITPERC_18 ZITPERC_LJ_18 LJ8_19 ZIT_19 ZITPERC_19 ZITPERC_LJ_19
0 00AP 0 Wereldwijs 2716PH ZOETERMEER 637 Zoetermeer Zuid-Holland Bo Gereformeerd vrijgemaakt ... ,181818181818182 ,0227272727272727 11 1 ,0909090909090909 ,0113636363636364 8 2 ,25 ,03125
1 00AR 0 BS "De Maasparel" 6107AW STEVENSWEERT 1641 Maasgouw Limburg Bo Rooms-Katholiek ... ,0588235294117647 ,00735294117647059 17 1 ,0588235294117647 ,00735294117647059 15 0 0 0
2 00AV 0 De Morgenster 3201CN SPIJKENISSE 1930 Nissewaard Zuid-Holland Bo Gereformeerd vrijgemaakt ... 0 0 12 1 ,0833333333333333 ,0104166666666667 7 0 0 0
3 00AZ 0 De Stapsteen 2971AR BLESKENSGRAAF CA 1978 Molenlanden Zuid-Holland Bo Openbaar ... ,25 ,03125 14 2 ,142857142857143 ,0178571428571429 11 1 ,0909090909090909 ,0113636363636364
4 00BA 0 OBS De Klimboom 6666EB HETEREN 1734 Overbetuwe Gelderland Bo Openbaar ... 0 0 15 0 0 0 11 0 0 0

5 rows × 39 columns

In this dataset I see that there are a lot of columns like ZIT_YEAR, ZITPERC_YEAR, ZITPERC_LJ_YEAR, LJ8_YEAR, so I would like to get this YEAR part and transform this columns to rows with specific year value and only 3 columns: ZIT, ZITPERC, ZITPERC_LJ and LJ8. It will need a lot of transformations, so let’s start with creating few separate dataset for each year:

def convert_to_numeric(dataframe, column):
    dataframe[column] = dataframe[column].astype(str).str.replace(',','.')
    dataframe[column] = dataframe[column].astype(str).str.replace(r'^.','0.')
    dataframe[column] = pd.to_numeric(dataframe[column])
    
def get_year_dataset(year):
    data = (data_raw.copy())[['BRIN_NUMMER', 'VESTIGINGSNUMMER', 
                              'LJ8_' + year, 'ZIT_' + year,
                              'ZITPERC_' + year]]
    data['SCHOOL_ID'] = data["VESTIGINGSNUMMER"].astype(str) + data["BRIN_NUMMER"] + "_20" + year
    data.rename(columns = {'LJ8_' + year: 'LJ8',
                           'ZIT_' + year: 'ZIT',
                           'ZITPERC_' + year: 'ZITPERC'},
                inplace = True)
    data.drop('VESTIGINGSNUMMER', 1, inplace = True)
    data.drop('BRIN_NUMMER', 1, inplace = True)
    convert_to_numeric(data, 'ZITPERC')
    data['ZITPERC'] = data['ZITPERC'] * 100
    return data


data_2014 = get_year_dataset('14')
data_2015 = get_year_dataset('15')
data_2016 = get_year_dataset('16')
data_2017 = get_year_dataset('17')
data_2018 = get_year_dataset('18')
data_2014.head()
data_2014.dtypes
LJ8            int64
ZIT            int64
ZITPERC      float64
SCHOOL_ID     object
dtype: object

And merge it together:

data_frames = [data_2014, data_2015, data_2016, data_2017, data_2018]
data_second_year = pd.concat(data_frames).fillna(0)
data_second_year.reset_index(drop=True, inplace=True)
print(data_2014.shape)
print(data_2015.shape)
print(data_2016.shape)
print(data_2017.shape)
print(data_2017.shape)
print(data_2018.shape)
print(data_second_year.shape)
data_second_year
(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(6910, 4)
(34550, 4)
LJ8 ZIT ZITPERC SCHOOL_ID
0 11 0 0.000000 000AP_2014
1 18 1 5.555556 000AR_2014
2 3 1 33.333333 000AV_2014
3 24 3 12.500000 000AZ_2014
4 12 4 33.333333 000BA_2014
... ... ... ... ...
34545 0 0 0.000000 031ML_2018
34546 0 0 0.000000 031MM_2018
34547 0 0 0.000000 031MN_2018
34548 0 0 0.000000 031MV_2018
34549 0 0 0.000000 031MX_2018

34550 rows × 4 columns

Let’s check missing and unexpected values:

print(data_second_year.dtypes)
LJ8            int64
ZIT            int64
ZITPERC      float64
SCHOOL_ID     object
dtype: object
lj8_zero = data_second_year[data_second_year['LJ8'] == 0]
print(lj8_zero.shape)
lj8_zero.head()
(2282, 4)
LJ8 ZIT ZITPERC SCHOOL_ID
61 0 0 0.0 003AX_2014
69 0 0 0.0 003BW_2014
75 0 0 0.0 003CS_2014
124 0 0 0.0 003HM_2014
172 0 0 0.0 003LC_2014

It looks like we have 2282 values with total amount of students in class 8 is 0. It could be wrong data or mistake, but anyway this rows says us nothing, so we can remove it:

data_second_year.drop(lj8_zero.index, 0, inplace = True)
print(data_second_year.shape)
data_second_year.head()
(32268, 4)
LJ8 ZIT ZITPERC SCHOOL_ID
0 11 0 0.000000 000AP_2014
1 18 1 5.555556 000AR_2014
2 3 1 33.333333 000AV_2014
3 24 3 12.500000 000AZ_2014
4 12 4 33.333333 000BA_2014

So now let’s read dataset from previous step, add our dataset there and write it to down to final file:

data_score = pd.read_csv("score_second_result.csv", error_bad_lines=False, sep=',', encoding = "ISO-8859-1") 

print(data_score.shape)
print(data_score.columns)
data_score.head()
(31300, 33)
Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'BEVOEGD_GEZAG_NUMMER', 'EXAMEN',
       'EXAMEN_AANTAL', 'EXAMEN_GEM', 'REKENEN_LAGER1F', 'REKENEN_1F',
       'REKENEN_1S', 'REKENEN_2F', 'LV_LAGER1F', 'LV_1F', 'LV_2F',
       'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO', 'PRO', 'VMBO', 'VMBO_HAVO',
       'HAVO', 'HAVO_VWO', 'VWO', 'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES'],
      dtype='object')
SCHOOL_ID DATUM INSTELLINGSNAAM_VESTIGING POSTCODE_VESTIGING PLAATSNAAM GEMEENTENUMMER GEMEENTENAAM PROVINCIE SOORT_PO DENOMINATIE_VESTIGING ... TV_2F VSO PRO VMBO VMBO_HAVO HAVO HAVO_VWO VWO ADVIES_NIET_MOGELIJK TOTAAL_ADVIES
0 000AP_2015 2015 De Schanskorf 2715BT ZOETERMEER 637 Zoetermeer Zuid-Holland Bo Gereformeerd vrijgemaakt ... 0 0 0 1 0 5 1 1 0 11
1 000AR_2015 2015 BS "De Maasparel" 6109AM OHE EN LAAK 1641 Maasgouw Limburg Bo Rooms-Katholiek ... 0 0 0 1 0 7 0 6 0 18
2 000AZ_2015 2015 De Kiezel en de Kei 2971AR BLESKENSGRAAF CA 1927 Molenwaard Zuid-Holland Bo Openbaar ... 0 0 1 3 5 3 2 4 0 22
3 000BA_2015 2015 OBS De Klimboom 6666EB HETEREN 1734 Overbetuwe Gelderland Bo Openbaar ... 0 1 1 1 0 1 4 0 0 12
4 000BB_2015 2015 Obs Letterwies 9944AR NIEUWOLDA 1895 Oldambt Groningen Bo Openbaar ... 0 0 0 7 0 5 0 2 0 16

5 rows × 33 columns

data = pd.merge(data_score, data_second_year,  on='SCHOOL_ID',  how='left').fillna(0)
data.LJ8 = data.LJ8.astype(int)
data.ZIT = data.ZIT.astype(int)
print(data.shape)
print(data.columns)
data.head()
(31300, 36)
Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENUMMER', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'BEVOEGD_GEZAG_NUMMER', 'EXAMEN',
       'EXAMEN_AANTAL', 'EXAMEN_GEM', 'REKENEN_LAGER1F', 'REKENEN_1F',
       'REKENEN_1S', 'REKENEN_2F', 'LV_LAGER1F', 'LV_1F', 'LV_2F',
       'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO', 'PRO', 'VMBO', 'VMBO_HAVO',
       'HAVO', 'HAVO_VWO', 'VWO', 'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES',
       'LJ8', 'ZIT', 'ZITPERC'],
      dtype='object')
SCHOOL_ID DATUM INSTELLINGSNAAM_VESTIGING POSTCODE_VESTIGING PLAATSNAAM GEMEENTENUMMER GEMEENTENAAM PROVINCIE SOORT_PO DENOMINATIE_VESTIGING ... VMBO VMBO_HAVO HAVO HAVO_VWO VWO ADVIES_NIET_MOGELIJK TOTAAL_ADVIES LJ8 ZIT ZITPERC
0 000AP_2015 2015 De Schanskorf 2715BT ZOETERMEER 637 Zoetermeer Zuid-Holland Bo Gereformeerd vrijgemaakt ... 1 0 5 1 1 0 11 13 0 0.000000
1 000AR_2015 2015 BS "De Maasparel" 6109AM OHE EN LAAK 1641 Maasgouw Limburg Bo Rooms-Katholiek ... 1 0 7 0 6 0 18 17 0 0.000000
2 000AZ_2015 2015 De Kiezel en de Kei 2971AR BLESKENSGRAAF CA 1927 Molenwaard Zuid-Holland Bo Openbaar ... 3 5 3 2 4 0 22 19 3 15.789474
3 000BA_2015 2015 OBS De Klimboom 6666EB HETEREN 1734 Overbetuwe Gelderland Bo Openbaar ... 1 0 1 4 0 0 12 24 5 20.833333
4 000BB_2015 2015 Obs Letterwies 9944AR NIEUWOLDA 1895 Oldambt Groningen Bo Openbaar ... 7 0 5 0 2 0 16 15 2 13.333333

5 rows × 36 columns

Looking at the list of columns I can see that we probably won’t need BEVOEGD_GEZAG_NUMMER and ZITPERC columns because we can get the info in this columns from other places in dataset. I also think that we should remove GEMEENTENUMMER columns because we’ve already have GEMEENTENAAM which has the same info. Let’s remove them and then write the result dataset to the output folder:

data.drop('GEMEENTENUMMER', 1, inplace = True)
data.drop('BEVOEGD_GEZAG_NUMMER', 1, inplace = True)
data.drop('ZITPERC', 1, inplace = True)
data.columns
Index(['SCHOOL_ID', 'DATUM', 'INSTELLINGSNAAM_VESTIGING', 'POSTCODE_VESTIGING',
       'PLAATSNAAM', 'GEMEENTENAAM', 'PROVINCIE', 'SOORT_PO',
       'DENOMINATIE_VESTIGING', 'EXAMEN', 'EXAMEN_AANTAL', 'EXAMEN_GEM',
       'REKENEN_LAGER1F', 'REKENEN_1F', 'REKENEN_1S', 'REKENEN_2F',
       'LV_LAGER1F', 'LV_1F', 'LV_2F', 'TV_LAGER1F', 'TV_1F', 'TV_2F', 'VSO',
       'PRO', 'VMBO', 'VMBO_HAVO', 'HAVO', 'HAVO_VWO', 'VWO',
       'ADVIES_NIET_MOGELIJK', 'TOTAAL_ADVIES', 'LJ8', 'ZIT'],
      dtype='object')
data.to_csv(r'output/Score.csv', index = False)