In [213]:
# Import pandas
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
%pylab inline
# Assign spreadsheet filename: file
file = '../data/imd_student_blind.xlsx'

# Load spreadsheet: xl
xl = pd.ExcelFile(file)

# Load a sheet into a DataFrame by index: df
df = xl.parse(0)
Populating the interactive namespace from numpy and matplotlib

Todas as disciplinas
Aprovados: 2766 (57.12%)
Reprovados: 2076 (42.88%)

In [214]:
#print(df['status.disciplina'].value_counts())
df['status.disciplina'].value_counts().plot('bar')
Out[214]:
<matplotlib.axes._subplots.AxesSubplot at 0x1779accce10>

Disciplina 3
Reprovados: 334 (61.74%)
Aprovados: 207 (38.26%)

Disciplina 5
Reprovado: 523 (66.80%)
Aprovados: 260 (33.20%)

In [215]:
df1 = pd.DataFrame(df[df.disciplina_ID == 3]["status.disciplina"].value_counts())
df2 = pd.DataFrame(df[df.disciplina_ID == 5]["status.disciplina"].value_counts())

df1['Key'] = 'Disc3'
df2['Key'] = 'Disc5'

DFF = pd.concat([df1,df2],keys=['Disc3','Disc5'])
DFF.plot(kind='bar', figsize = (14,6), title = "Status dos alunos nas diciplinas 3 e 5")
Out[215]:
<matplotlib.axes._subplots.AxesSubplot at 0x1779823e198>

Como a disciplina 5 tem uma taxa de 25.19% de todas as reprovações dos 2 primeiros semestres então vamos trabalhar com ela

In [216]:
pd_disc5 = pd.DataFrame(df[df.disciplina_ID == 5])
pd_disc5["nota"].plot('hist', figsize = (14,6), title = "Notas dos alunos da disciplina 5")
Out[216]:
<matplotlib.axes._subplots.AxesSubplot at 0x177981108d0>
In [217]:
def tableAnalytics(mydf, idDisc): 
    pd_mydf = pd.DataFrame(mydf[mydf.disciplina_ID == idDisc])
    reprovados, aprovados, anoSemestre, valMin, quantile1, median, quantile3, valMax, mean, variance, standardDeviation, amplitude = list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list()
    for ano in range(2014, 2017):
        for semestre in range (1, 3):
            pd_aux = pd_mydf[pd_mydf.periodo_disciplina == semestre][pd_mydf.ano_disciplina == ano]
            aprovados.append(str(pd_aux['status.disciplina'].value_counts()["Aprovado"])+" ("+str(round(pd_aux['status.disciplina'].value_counts(True)["Aprovado"]*100, 2))+"%)")
            reprovados.append(str(pd_aux['status.disciplina'].value_counts()["Reprovado"])+" ("+str(round(pd_aux['status.disciplina'].value_counts(True)["Reprovado"]*100, 2))+"%)")
            anoSemestre.append(str(ano)+"."+str(semestre))
            valMin.append(pd_aux['nota'].min())
            quantile1.append(pd_aux['nota'].quantile(q = 0.25))
            median.append(pd_aux['nota'].median())
            quantile3.append(pd_aux['nota'].quantile(q = 0.75))
            valMax.append(pd_aux['nota'].max())
            mean.append(pd_aux['nota'].mean())
            variance.append(pd_aux['nota'].var())
            standardDeviation.append(pd_aux['nota'].std())

    summary = pd.DataFrame({
        '_Aprovados': aprovados,
        '_Reprovados': reprovados,
        'AnoSemestre' : anoSemestre, 
        'ValueMin': valMin, 
        'Quantile1':quantile1, 
        'Median': median, 
        'Quantile3': quantile3, 
        'ValueMax': valMax, 
        'Average': mean, 
        'Variance': variance, 
        'StandardDeviation': standardDeviation})
    return summary
In [218]:
tableAnalytics(df, 5)
Out[218]:
AnoSemestre Average Median Quantile1 Quantile3 StandardDeviation ValueMax ValueMin Variance _Aprovados _Reprovados
0 2014.1 4.311364 4.45 1.25 6.55 3.098778 9.8 0.0 9.602426 20 (45.45%) 24 (54.55%)
1 2014.2 3.495276 3.00 0.80 5.80 2.989475 10.0 0.0 8.936962 46 (36.22%) 81 (63.78%)
2 2015.1 3.630526 3.80 1.50 5.30 2.426366 9.1 0.0 5.887250 33 (34.74%) 62 (65.26%)
3 2015.2 3.815054 3.70 1.00 6.40 2.990180 10.0 0.0 8.941178 83 (44.62%) 103 (55.38%)
4 2016.1 2.197516 1.20 0.40 3.50 2.374762 10.0 0.0 5.639494 29 (18.01%) 132 (81.99%)
5 2016.2 2.595882 1.70 0.50 5.00 2.470784 10.0 0.0 6.104776 49 (28.82%) 121 (71.18%)
In [219]:
#df[df.ano_disciplina == 2016][df.periodo_disciplina == 2].head()
In [220]:
df[df.ano_disciplina == 2016][df.periodo_disciplina == 1][df.disciplina_ID == 5]["nota"].plot.box(by=None, grid=True)
Out[220]:
<matplotlib.axes._subplots.AxesSubplot at 0x1779db0b9e8>
In [221]:
df[df.ano_disciplina == 2016][df.periodo_disciplina == 2][df.disciplina_ID == 3]["nota"].plot.box(by=None, grid=True)
Out[221]:
<matplotlib.axes._subplots.AxesSubplot at 0x1779d49aa90>
In [222]:
from sklearn import linear_model
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
allPeriod = []
person = []  
person2 = []
disciplineID = []
studentsFilter = []
for ano in range(2014, 2017):
    for semestre in range(1, 3):
        studentsFilter = df[df.ano_disciplina == ano][df.periodo_disciplina == semestre][df.disciplina_ID == 5]["a_ID"].unique()
        classFilter = df[df.ano_disciplina == ano][df.periodo_disciplina == semestre]
        for disId in range(0,7):
            listKey = []
            listRelative = []
            for student in studentsFilter:
                if not classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == disId]["nota"].empty:
                    listKey.append(classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == 5]["nota"].values)
                    listRelative.append(classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == disId]["nota"].values)
            disciplineID.append(disId)
            allPeriod.append(str(ano)+"."+str(semestre))
            person.append(pearsonr(listKey, listRelative)[0])
            person2.append((pearsonr(listKey, listRelative)[0]**2)*100)

Foi usado o Coeficiente de correlação de Pearson para verificar a correlação da nota da disciplina5 com as outras que o aluno está cursando no mesmo semestre.
Não tendo uma regularidade nos resultados, não podemos afirmar que exista um correlação linear entre as disciplinas analisadas.
É possível que existam variáveis ocultas que estão interferindo nos valores, como dedicação dos alunos, material didático, habilidades de ensino dos professores e outros.

In [223]:
df_person = pd.DataFrame({
        'anoSemestre': allPeriod,
        'person': person,
        'person²': person2, 
        'disciplineID' : disciplineID})
df_person.sort(['person²'])
Out[223]:
anoSemestre disciplineID person person²
15 2015.1 1 [0.267057499352] [7.13197079602]
9 2014.2 2 [0.28116154] [7.90518115753]
17 2015.1 3 [0.38600875565] [14.9002759439]
14 2015.1 0 [0.387778798555] [15.0372396609]
11 2014.2 4 [0.399901984754] [15.992159741]
0 2014.1 0 [0.468655954707] [21.9638403882]
20 2015.1 6 [0.472822324856] [22.3560950882]
32 2016.1 4 [0.489136010932] [23.9254037191]
25 2015.2 4 [0.493763186376] [24.380208422]
39 2016.2 4 [0.560699375025] [31.4383789154]
28 2016.1 0 [0.586983237993] [34.4549321685]
35 2016.2 0 [0.603025642231] [36.3639925189]
16 2015.1 2 [0.62061531425] [38.5163368282]
27 2015.2 6 [0.624358399265] [38.9823410733]
1 2014.1 1 [0.629617572551] [39.6418287665]
8 2014.2 1 [0.642538969779] [41.2856327685]
13 2014.2 6 [0.663623865002] [44.0396634201]
41 2016.2 6 [0.697620700116] [48.6674641231]
31 2016.1 3 [0.704132245131] [49.5802218634]
34 2016.1 6 [0.716757429658] [51.3741212971]
21 2015.2 0 [0.745531933908] [55.5817864476]
29 2016.1 1 [0.748618973011] [56.0430366752]
7 2014.2 0 [0.764873527407] [58.5031512928]
10 2014.2 3 [0.772043824064] [59.6051666275]
38 2016.2 3 [0.772867683885] [59.7324456794]
24 2015.2 3 [0.848964769561] [72.0741179955]
3 2014.1 3 [0.88716437582] [78.7060629725]
36 2016.2 1 [-0.937185087117] [87.8315887514]
6 2014.1 6 [0.938342894493] [88.0487387645]
19 2015.1 5 [1.0] [100.0]
5 2014.1 5 [1.0] [100.0]
33 2016.1 5 [1.0] [100.0]
26 2015.2 5 [1.0] [100.0]
2 2014.1 2 [1.0] [100.0]
22 2015.2 1 -1 100
12 2014.2 5 [1.0] [100.0]
40 2016.2 5 [1.0] [100.0]
4 2014.1 4 NaN NaN
18 2015.1 4 NaN NaN
23 2015.2 2 NaN NaN
30 2016.1 2 NaN NaN
37 2016.2 2 NaN NaN
In [224]:
from sklearn import linear_model
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
allPeriod = []
person = []  
person2 = []
disciplineID = []
studentsFilter = []
for ano in range(2014, 2016):
    for semestre in range(1, 3):
        studentsFilter = df[df.ano_disciplina == ano][df.periodo_disciplina == semestre][df.disciplina_ID == 5]["a_ID"].unique()
        classFilter = df[df.ano_disciplina == ano][df.periodo_disciplina == semestre]
        for disId in range(2,3):
            listKey = []
            listRelative = []
            for student in studentsFilter:
                if not classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == disId]["nota"].empty:
                    print(student) #estudantes que estão magicamente pagando dis3 e dis5
                    listKey.append(classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == 5]["nota"].values)
                    listRelative.append(classFilter[classFilter.a_ID == student][classFilter.disciplina_ID == disId]["nota"].values)
      
            disciplineID.append(disId)
            allPeriod.append(str(ano)+"."+str(semestre))
            person.append(pearsonr(listKey, listRelative)[0])
            person2.append((pearsonr(listKey, listRelative)[0]**2)*100)
            
df_person = pd.DataFrame({
        'anoSemestre': allPeriod,
        'person': person,
        'person2': person2, 
        'disciplineID' : disciplineID})
#df_person.sort(['person2'])
107
219
7
17
32
124
126
161
135
297
520
In [225]:
#df[df.a_ID == 219]