Press "Enter" to skip to content

Feature Leakage, and Identifying It with Exploratory Data Analysis and Machine Learning


library(tidyverse) # Loading some data loan_data 6M", "20. > 6M"), N = c(18232L, 5115L, 1697L, 819L, 364L, 761L, 476L, 245L, 308L, 137L, 210L, 108L, 155L, 89L, 77L, 137L, 52L, 108L, 103L, 39L, 569L, 260L, 233L, 182L, 1597L, 156L, 109L, 817L, 590L, 116L, 817L, 100L, 51L, 62L, 9L, 1L, 3L, 4L, 1L), percent = c(0.780914036064591, 0.219085963935409, 0.674483306836248, 0.325516693163752, 0.323555555555556, 0.676444444444444, 0.660194174757282, 0.339805825242718, 0.692134831460674, 0.307865168539326, 0.660377358490566, 0.339622641509434, 0.635245901639344, 0.364754098360656, 0.35981308411215, 0.64018691588785, 0.325, 0.675, 0.725352112676056, 0.274647887323944, 0.686369119420989, 0.313630880579011, 0.56144578313253, 0.43855421686747, 0.911009697661152, 0.0889903023388477, 0.117710583153348, 0.882289416846652, 0.835694050991501, 0.164305949008499, 0.890948745910578, 0.109051254089422, 0.451327433628319, 0.548672566371681, 1, 0.25, 0.75, 0.8, 0.2), tots = c(23347L, 23347L, 2516L, 2516L, 1125L, 1125L, 721L, 721L, 445L, 445L, 318L, 318L, 244L, 244L, 214L, 214L, 160L, 160L, 142L, 142L, 829L, 829L, 415L, 415L, 1753L, 1753L, 926L, 926L, 706L, 706L, 917L, 917L, 113L, 113L, 9L, 4L, 4L, 5L, 5L), conf_low = c(0.775552136317493, 0.213794081502295, 0.65578046562415, 0.307220804467065, 0.296264735635882, 0.648227521218143, 0.624326658051425, 0.305255642604346, 0.646947176024304, 0.265253980427813, 0.60544358384926, 0.287709357961987, 0.571443652323727, 0.304282016481803, 0.295522603420615, 0.571952527712148, 0.25317409400087, 0.596551368545636, 0.64420157566435, 0.203150823708409, 0.653560936603063, 0.282154345692913, 0.51220524670192, 0.390195953557052, 0.896698056863425, 0.076072772673856, 0.0976559949418072, 0.859767702403072, 0.80626156910148, 0.137713232814479, 0.868959355941994, 0.0896127959455879, 0.357541357583628, 0.452272456810347, 0.663732883120057, 0.00630946320970987, 0.194120449683243, 0.283582063881911, 0.00505076337946806), conf_hi = c(0.786205918497705, 
0.224447863682507, 0.692779195532935, 0.34421953437585, 0.351772478781857, 0.703735264364118, 0.694744357395654, 0.375673341948575, 0.734746019572187, 0.353052823975696, 0.712290642038013, 0.39455641615074, 0.695717983518197, 0.428556347676273, 0.428047472287852, 0.704477396579385, 0.403448631454364, 0.74682590599913, 0.796849176291591, 0.35579842433565, 0.717845654307087, 0.346439063396937, 0.609804046442948, 0.48779475329808, 0.923927227326144, 0.103301943136575, 0.140232297596928, 0.902344005058193, 0.862286767185521, 0.19373843089852, 0.910387204054412, 0.131040644058006, 0.547727543189653, 0.642458642416372, 1, 0.805879550316757, 0.99369053679029, 0.994949236620532, 0.716417936118089)), row.names = c(NA, -39L), class = "data.frame") observed_n_per_cat % filter(finalClass == "Success") %>% pull(tots) geom_negloglikelihood = function(logit_prob, dat) { -sum(dgeom(seq_along(dat)-1, prob = plogis(logit_prob), log = T) * dat)
} mle_prob = plogis(optimize(f = geom_negloglikelihood, dat = observed_n_per_cat, lower = -10, upper = 10)$minimum) expected_n_per_cat = sum(observed_n_per_cat) * dgeom(seq_along(observed_n_per_cat)-1, prob = mle_prob) chisq_statistic 

7 Comments

  1. ปั้มไลค์ August 1, 2020

    Like!! I blog quite often and I genuinely thank you for your information. The article has truly piqued my interest.

  2. SMS August 1, 2020

    bookmarked!!, I like your blog!

  3. kyrie 5 August 10, 2020

    I’m typically into blogging and I really appreciate your content. The article has actually piqued my interest. I’m going to bookmark your website and keep checking for new information.

  4. kyrie 6 shoes August 11, 2020

    Thank you a lot for providing individuals with such a splendid opportunity to check tips from this web site. It is always so excellent and stuffed with a good time for me and my office peers to visit the blog not less than thrice per week to find out the new secrets you will have. And of course, we’re certainly amazed considering the amazing tactics served by you. Some 3 facts in this article are undoubtedly the very best we’ve ever had.

Leave a Reply

Your email address will not be published.