In [1]:
%reload_ext autotime
import pandas as pd
import plotly.io as pio
import plotly.express as px # Plotting
from tqdm import tqdm
tqdm.pandas()
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
pio.renderers.default = 'notebook'
pd.options.plotting.backend = "plotly"
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 100)
time: 1.61 s (started: 2025-06-27 20:34:12 +12:00)
In [2]:
df = pd.read_excel("LLM_results_cleaned.xlsx")
df = df[(df.timeLeft == "closed") & (df.goal > 0)]
df.sentiment = df.sentiment.astype(str).str.split(" ").str[0].astype(float)
df.truth = df.truth.astype(str).str.split(" ").str[0].astype(float)
df.urgency = df.urgency.astype(str).str.split(" ").str[0].astype(float)
df["Target_completion"] = df.amountRaised / df.goal * 100
df
Out[2]:
Index | title | pitch | hero | uri | amountRaised | goal | timeLeft | action | actionUri | start | end | n_questions | location | n_donors | description | use_of_funds | whos_involved | moderated | n_updates | updates | _21 | text | condition | ICD10 | ICD | name | gender | age | age_group | ethnicity | urgency | sentiment | truth | notes | smiling | deservingness | attractiveness | use | Target_completion | Charity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 31 | Helping a person in need | Helping out a person in need that would drop his hat to help someone else | https://static.givealittle.co.nz/assets/hero/ed760e80-7e10-4078-b7a8-b2ec0104419b-320:False | https://givealittle.co.nz/cause/helping-a-person-in-need | 1700.0 | 2000.0 | closed | Read more | https://givealittle.co.nz/cause/helping-a-person-in-need | 29 May 2025 | 8 Jun 2025 | 0 | Blenheim, Marlborough | 54.0 | Tok had a serious injury while running — he tore a muscle, which led to internal bleeding and ev... | medical care where needed and general stuff around the house if needed also and utility's | Created by, and paying to a verified bank account of,\r\nMatthew bergman on behalf of Tok H... | Page Moderated | 3.0 | Big thanks\n7 June 2025\nHey team huge thanks to all that have donated Tok is humbled beyond wor... | NaN | Helping a person in need Helping a person in need Helping out a person in need that would drop h... | Muscle tear leading to internal bleeding and infection | S86.0 | Injury, poisoning and certain other consequences of external causes | Tok | male | 35 | 15-64 | Pacific Islander | 90.0 | 70.0 | 95.0 | Age estimated based on appearance; ethnicity inferred from image; sentiment reflects a mix of gr... | 1 | 95 | 60 | medical expenses | 85.000000 | False |
1 | 56 | Please help Walter 🙏 to get the veterinary care he needs. | Please help us 🙏 Walty needs bloodwork to rule out cancer 😪 | https://static.givealittle.co.nz/assets/hero/a5763829-1835-4780-adc3-b2e500bd5557-320:False | https://givealittle.co.nz/cause/please-help-walter-to-get-the-veterinary-care-he | 380.0 | 500.0 | closed | Read more | https://givealittle.co.nz/cause/please-help-walter-to-get-the-veterinary-care-he | 22 May 2025 | 1 Jun 2025 | 0 | Otago | 14.0 | Walter has been my life for 15 years, since he was a pup. It has always being just the two of us... | The money raised will go into Walts vet costsFull bloods, medication etc at the moment is quoted... | Created by, and paying to a verified bank account of,\r\nKerri Back | Page Moderated | 1.0 | Bloodwork done.\n23 May 2025\nThank you so much for your kind donations.\nWalty has had his bloo... | NaN | Please help Walter 🙏 to get the veterinary care he needs. Please help Walter 🙏 to get the veteri... | Pancreatitis | K85 | 11 | Walter | male | 15 | 15-64 | Unknown | 90.0 | 70.0 | 95.0 | The primary subject is an animal named Walter, not a human. The text mentions pancreatitis as th... | 1 | 90 | 80 | medical expenses | 76.000000 | False |
2 | 59 | Medical Treatment for Atish Lal for Coronary Artery Disease | medical Treatment | https://static.givealittle.co.nz/assets/hero/0a81d0ab-2bfc-4890-8049-b2e3017fea04-320:False | https://givealittle.co.nz/cause/medical-treatment-for-atish-lal-for-coronary | 380.0 | 14900.0 | closed | Read more | https://givealittle.co.nz/cause/medical-treatment-for-atish-lal-for-coronary | 20 May 2025 | 6 Jun 2025 | 0 | Auckland | 6.0 | Dear Family and Friends,\nI hope this message finds you well.\nI am reaching out today with a hu... | Estimated costs for Treatment & Travel:- PTCA treatment package + home stay : $11,000 USD- Reser... | Created by\r\nShaneel Prasad \n\n\n\n\r\n Paying to a verified bank account of\r\nPriya Go... | Page Moderated | NaN | NaN | NaN | Medical Treatment for Atish Lal for Coronary Artery Disease Medical Treatment for Atish Lal for ... | Coronary Artery Disease | I25 | I - Diseases of the circulatory system | Atish Lal | male | 35 | 15-64 | Indian | 90.0 | 70.0 | 95.0 | Age estimated based on typical age range for coronary artery disease; ethnicity inferred from th... | 0 | 85 | 50 | medical expenses | 2.550336 | False |
3 | 74 | Help me to reclaim my freedom. | Help my to reclaim my freedom and contribute to my community. | https://static.givealittle.co.nz/assets/hero/0e91c689-ef0e-4c5c-aa9d-b2e0010f721b-320:False | https://givealittle.co.nz/cause/help-me-to-reclaim-my-freedom | 2999.0 | 6000.0 | closed | Read more | https://givealittle.co.nz/cause/help-me-to-reclaim-my-freedom | 17 May 2025 | 5 Jun 2025 | 0 | Bay of Plenty | 15.0 | Two years after my first Covid infection, I’m still living with debilitating Long Covid, Chronic... | The money will be spend on a mobility scooter that can be folded down and taken in the car so I ... | Created by, and paying to a verified bank account of,\r\nAmy Cole | Page Moderated | NaN | NaN | NaN | Help me to reclaim my freedom. Help me to reclaim my freedom. Help my to reclaim my freedom and ... | Long Covid, Chronic Fatigue/ME, PEM, Acquired Apraxia of Speech | G93.3 (Postviral fatigue syndrome) | Chapter VI: Diseases of the nervous system | Amy | female | 35 | 15-64 | Caucasian | 80.0 | 60.0 | 90.0 | Age estimated based on appearance in the image. Ethnicity assumed based on visual characteristics. | 1 | 90 | 70 | medical expenses | 49.983333 | False |
4 | 75 | Help My Mum Fight Cancer - Every Bit Counts | Rasing funds for my mum's cancer treatment in India - every donation brings us closer to saving ... | https://static.givealittle.co.nz/assets/hero/3b1e6715-fdbd-4e4c-b787-b2e20160c6d0-320:False | https://givealittle.co.nz/cause/help-my-mum-fight-cancer-every-bit-counts | 630.0 | 50000.0 | closed | Read more | https://givealittle.co.nz/cause/help-my-mum-fight-cancer-every-bit-counts | 17 May 2025 | 19 May 2025 | 0 | Otahuhu, Auckland | 26.0 | I'm reaching out with a heavy heart and hope in my hands. My mum, the strongest person I know, h... | Hospital bills, chemotherapy, travel expenses, medications, post-treatment care in India. | Created by, and paying to a verified bank account of,\r\nAlana Al Jebin on behalf of My mot... | Page Moderated | NaN | NaN | NaN | Help My Mum Fight Cancer - Every Bit Counts Help My Mum Fight Cancer - Every Bit Counts Rasing f... | Cancer | C00-C97 | Neoplasms | Not explicitly mentioned | female | Not explicitly mentioned | indeterminate/unknown | Indian | 90.0 | 80.0 | 95.0 | Age and name not explicitly provided in the text. Ethnicity assumed based on the mention of trea... | 0 | 95 | 50 | medical expenses | 1.260000 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4406 | 11192 | New Zealand MindBody Trust web-based 'whole person' resources for healthworkers and their patien... | The NZ MindBody Trust web-based project to support and educate clinicians and patients in a 'who... | https://static.givealittle.co.nz/assets/hero/158d0e80-2c47-449a-a05e-a603014e786e-320:False | https://givealittle.co.nz/cause/mindbodyresourcesforpatientsandhealthworkers | 6700.0 | 9000.0 | closed | Read more | https://givealittle.co.nz/cause/mindbodyresourcesforpatientsandhealthworkers | 9 May 2016 | 15 Jul 2016 | 0 | Auckland | 36.0 | The NZ MindBody Trust (2004) is concerned with 'whole person' and person-centred approaches to h... | NaN | Created by, and paying to a verified bank account of,_x000D_\n _x000D_\n New Zealand... | Page Moderated | 2.0 | The Give-A-Little Campaign Result\n12 July 2016\nThe MindBody Trustees want to thank you all for... | NaN | New Zealand MindBody Trust web-based 'whole person' resources for healthworkers and their patien... | Chronic Disorders | R53 | Chapter XVIII: Symptoms, Signs and Abnormal Clinical and Laboratory Findings, Not Elsewhere Clas... | Not specified | other | Not specified | indeterminate/unknown | Not specified | 80.0 | 90.0 | 95.0 | The text does not specify a particular individual, so details like name, gender, age, and ethnic... | 0 | 90 | 50 | medical expenses | 74.444444 | True |
4407 | 11194 | Diane's cyst removal to continue my dream job | Large Radial Ganglion Cyst Removal | https://static.givealittle.co.nz/assets/hero/06af4a19-e239-44bb-a5a8-a60400ab4982-320:False | https://givealittle.co.nz/cause/wristoperation | 730.0 | 5000.0 | closed | Read more | https://givealittle.co.nz/cause/wristoperation | 9 May 2016 | 3 Jun 2016 | 0 | Northland | 11.0 | I am in need of a large ganglion cyst removed from my left wrist. It has entwined itself with 2 ... | NaN | Created by, and paying to a verified bank account of,_x000D_\n _x000D_\n Diane Crocker | Page Moderated | NaN | NaN | NaN | Diane's cyst removal to continue my dream job Diane's cyst removal to continue my dream job Larg... | Radial Ganglion Cyst | M75.3 | M00-M99 Diseases of the musculoskeletal system and connective tissue | Diane | female | 45 | 15-64 | Caucasian | 80.0 | 50.0 | 90.0 | Age estimated based on appearance; ethnicity inferred from image. | 0 | 85 | 60 | medical expenses | 14.600000 | False |
4408 | 11202 | A thank you to Waikato NICU for helping so many little babies and their families | Raising funds for Waikato NICU, and a portable cardiac monitor to support the miracle babies. | https://static.givealittle.co.nz/assets/hero/75b48fa1-0752-44b4-8484-a5fb015ecab2-320:False | https://givealittle.co.nz/cause/charlieandoliver | 8912.7 | 35000.0 | closed | Read more | https://givealittle.co.nz/cause/charlieandoliver | 3 May 2016 | 30 Jun 2017 | 0 | Waikato | 100.0 | This give-a-little page aims to raise funds to allow the NICU at Waikato Hospital to purchase a ... | NaN | Created by_x000D_\n _x000D_\n Rebecca Clews_x000D_\n \n\n\n\n\n\n\n_x000D_\n ... | Page Moderated | 4.0 | Quiz Night!\n7 November 2016\nGot no plans on the 17th of November? Come on down to the Speights... | NaN | A thank you to Waikato NICU for helping so many little babies and their families A thank you to ... | Premature birth | P07 | Chapter XVII: Congenital malformations, deformations and chromosomal abnormalities | Charlie and Oliver | male | Newborn (preterm) | 0-14 | Asian | 90.0 | 80.0 | 95.0 | The text mentions premature birth and the need for a portable cardiac monitor for preterm infant... | 1 | 95 | 70 | medical expenses | 25.464857 | False |
4409 | 11203 | Help restore David’s smile after being assaulted | Talented professional musician and entertainer David Shanhun needs your help to restore his teet... | https://static.givealittle.co.nz/assets/hero/ee0d17f1-b6a5-4d41-85e9-a5fa0166b741-320:False | https://givealittle.co.nz/cause/help-restore-davids-teeth-and-smile | 8048.0 | 54500.0 | closed | Read more | https://givealittle.co.nz/cause/help-restore-davids-teeth-and-smile | 3 May 2016 | 31 Jul 2016 | 0 | Auckland | 115.0 | David was assaulted on the 16th of April 2016. He had 2 teeth knocked out completely, and 10 tee... | NaN | Created by_x000D_\n _x000D_\n Fiona Shanhun_x000D_\n \n\n\n\n\n\n\n_x000D_\n ... | Page Moderated | 2.0 | Health/Teeth/Face Update\n24 May 2016\n***Message from David:***\nThank you all so much for your... | NaN | Help restore David’s smile after being assaulted Help restore David’s smile after being assaulte... | Dental Trauma | S02.5 | Injury, poisoning and certain other consequences of external causes | David Shanhun | male | Not specified | 15-64 | Caucasian | 80.0 | 70.0 | 90.0 | Age is not explicitly mentioned in the text; assumed to be within the 15-64 range based on conte... | 0 | 90 | 70 | medical expenses | 14.766972 | False |
4410 | 11212 | TRUS Biopsy Machine for Taranaki | The men of Taranaki need this TRUS Biopsy Machine to diagnose Prostate Cancer. Reduce waiting t... | https://static.givealittle.co.nz/assets/hero/c9169845-63b5-4c1e-bb50-a5fc00e5abd0-320:False | https://givealittle.co.nz/cause/prostatebiopsymachinenewplymouth | 1910.0 | 20000.0 | closed | Read more | https://givealittle.co.nz/cause/prostatebiopsymachinenewplymouth | 29 Apr 2016 | 31 Aug 2016 | 0 | Taranaki | 15.0 | Currently the TRUS (Trans Rectal Ultrasound) machine for diagnosing prostate cancer is only avai... | NaN | Created by, and paying to a verified bank account of,_x000D_\n _x000D_\n Rotary Club... | Page Moderated | NaN | NaN | NaN | TRUS Biopsy Machine for Taranaki TRUS Biopsy Machine for Taranaki The men of Taranaki need this ... | Prostate Cancer | C61 | Neoplasms | Men of Taranaki | male | Not specified | 15-64 | other | 80.0 | 90.0 | 95.0 | The text does not specify a specific individual but refers to 'men of Taranaki'. Age is inferred... | 0 | 90 | 50 | medical expenses | 9.550000 | True |
4411 rows × 41 columns
time: 3.73 s (started: 2025-06-27 20:34:14 +12:00)
In [3]:
df.gender.value_counts().head(20)
Out[3]:
gender female 2258 male 1613 other 540 Name: count, dtype: int64
time: 4.59 ms (started: 2025-06-27 20:34:18 +12:00)
In [4]:
print("\n".join(df[df.gender == "Male and Female"].uri.to_list()))
time: 1.51 ms (started: 2025-06-27 20:34:18 +12:00)
In [5]:
df.gender = df.gender.str.lower().where(df.gender.str.lower().isin(['male', 'female']), 'other')
df.gender.value_counts()
Out[5]:
gender female 2258 male 1613 other 540 Name: count, dtype: int64
time: 7.61 ms (started: 2025-06-27 20:34:18 +12:00)
In [6]:
df.ethnicity.value_counts().head(20)
Out[6]:
ethnicity Caucasian 2541 other 396 Māori 272 South Asian 214 Asian 161 Pacific Islander 144 Not specified 121 Unknown 121 African 57 Filipino 54 Indian 49 East Asian 42 White 39 Mixed 38 South Asian/Indian 35 Filipino/Asian 33 Latin American/Hispanic 29 Fijian 25 Hispanic/Latino 22 Not applicable 18 Name: count, dtype: int64
time: 4.3 ms (started: 2025-06-27 20:34:18 +12:00)
In [7]:
df.ethnicity = df.ethnicity.str.extract(r"(.+?)(?= \(|$)")
df.ethnicity.value_counts().head(20)
Out[7]:
ethnicity Caucasian 2541 other 396 Māori 272 South Asian 214 Asian 161 Pacific Islander 144 Not specified 121 Unknown 121 African 57 Filipino 54 Indian 49 East Asian 42 White 39 Mixed 38 South Asian/Indian 35 Filipino/Asian 33 Latin American/Hispanic 29 Fijian 25 Hispanic/Latino 22 Not applicable 18 Name: count, dtype: int64
time: 11.5 ms (started: 2025-06-27 20:34:18 +12:00)
In [8]:
df.ethnicity = df.ethnicity.replace({
"European/Caucasian": "Caucasian",
"European/White": "Caucasian",
"European/White": "Caucasian",
"European/New Zealand": "Caucasian",
"Maori": "Māori"
})
top_20 = df.ethnicity.value_counts().head(20)
display(top_20)
df.ethnicity = df.ethnicity.where(df.ethnicity.isin(top_20.index), "other")
ethnicity Caucasian 2541 other 396 Māori 272 South Asian 214 Asian 161 Pacific Islander 144 Not specified 121 Unknown 121 African 57 Filipino 54 Indian 49 East Asian 42 White 39 Mixed 38 South Asian/Indian 35 Filipino/Asian 33 Latin American/Hispanic 29 Fijian 25 Hispanic/Latino 22 Not applicable 18 Name: count, dtype: int64
time: 8.74 ms (started: 2025-06-27 20:34:18 +12:00)
In [9]:
df.condition.str.lower().value_counts().head(20)
Out[9]:
condition cancer 192 gender dysphoria 135 breast cancer 135 stroke 75 multiple sclerosis 65 endometriosis 45 bowel cancer 44 type 1 diabetes 39 melanoma 38 obesity 37 none specified 36 cerebral palsy 35 terminal cancer 33 infertility 32 mental health issues 31 covid-19 27 lung cancer 27 coronary artery disease 27 hearing loss 26 brain tumor 26 Name: count, dtype: int64
time: 8.09 ms (started: 2025-06-27 20:34:18 +12:00)
In [10]:
df.sentiment.value_counts()
Out[10]:
sentiment 70.0 1717 80.0 603 50.0 524 60.0 493 90.0 422 30.0 258 85.0 235 20.0 64 95.0 35 40.0 28 75.0 26 10.0 5 100.0 1 Name: count, dtype: int64
time: 5.93 ms (started: 2025-06-27 20:34:18 +12:00)
In [11]:
df.use.str.lower().value_counts().head(20)
Out[11]:
use medical expenses 3287 travel expenses 678 experimental therapies 177 lost wages 141 funeral expenses 19 funeral costs 10 medical expenses, travel expenses 5 other 4 educational tools 3 educational expenses 3 travel expenses, lost wages 3 living costs 2 legal expenses 2 financial expenses 2 living expenses 2 operational costs 2 household expenses 2 educational campaigns 2 celebration 1 renovation supplies 1 Name: count, dtype: int64
time: 6.93 ms (started: 2025-06-27 20:34:18 +12:00)
In [12]:
df.age_group = df.age_group.str.extract(r"(.+?)(?= \(|$)")
df.age_group = df.age_group.str.lower().where(df.age_group.str.lower().isin(['0-14', '15-64', '65+']), 'indeterminate/unknown')
df.age_group.value_counts()
Out[12]:
age_group 15-64 2809 0-14 649 65+ 558 indeterminate/unknown 395 Name: count, dtype: int64
time: 17.1 ms (started: 2025-06-27 20:34:18 +12:00)
In [13]:
df.gender.value_counts()
Out[13]:
gender female 2258 male 1613 other 540 Name: count, dtype: int64
time: 5.2 ms (started: 2025-06-27 20:34:18 +12:00)
In [14]:
df["Charity"] = df.whos_involved.str.contains(r"(Charity)", regex=False)
time: 2.96 ms (started: 2025-06-27 20:34:18 +12:00)
In [15]:
df.to_excel("LLM_results_cleaned.xlsx", index=False)
time: 5.88 s (started: 2025-06-27 20:34:18 +12:00)
In [16]:
df["met_target"] = (df.amountRaised >= df.goal).astype(int)
time: 1.95 ms (started: 2025-06-27 20:34:24 +12:00)
In [17]:
def parse(age):
try:
return float(age)
except ValueError:
return None
df["parsed_age"] = df.age.apply(parse)
df.parsed_age.value_counts().head(20)
Out[17]:
parsed_age 35.0 371 30.0 247 25.0 189 45.0 186 60.0 136 50.0 128 55.0 117 65.0 111 40.0 95 28.0 79 20.0 72 38.0 61 23.0 45 17.0 42 18.0 41 8.0 40 22.0 39 13.0 39 10.0 37 70.0 36 Name: count, dtype: int64
time: 10.8 ms (started: 2025-06-27 20:34:24 +12:00)
In [18]:
df.met_target.value_counts()
Out[18]:
met_target 0 3785 1 626 Name: count, dtype: int64
time: 4.99 ms (started: 2025-06-27 20:34:24 +12:00)
In [19]:
df.deservingness = df.deservingness.apply(parse).astype(float)
df.deservingness.describe()
Out[19]:
count 4410.000000 mean 91.346939 std 4.647941 min 50.000000 25% 90.000000 50% 90.000000 75% 95.000000 max 100.000000 Name: deservingness, dtype: float64
time: 11.8 ms (started: 2025-06-27 20:34:24 +12:00)
In [20]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, chi2_contingency, pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
def analyze_feature_influence(df, target_col='met_target'):
results = {}
target = df[target_col]
is_target_numeric = np.issubdtype(target.dtype, np.number)
for col in df.columns:
if col == target_col:
continue
feature = df[col]
if feature.isnull().any() or target.isnull().any():
continue # skip missing for simplicity
if is_target_numeric:
# Target is numeric
if np.issubdtype(feature.dtype, np.number):
# Pearson correlation
corr, p_value = pearsonr(feature, target)
results[col] = {
'test': 'pearsonr',
'stat': corr,
'p_value': p_value
}
else:
# ANOVA (feature is categorical)
groups = [target[feature == val] for val in feature.unique()]
stat, p_value = f_oneway(*groups)
results[col] = {
'test': 'anova',
'stat': stat,
'p_value': p_value
}
else:
# Target is categorical
if not np.issubdtype(feature.dtype, np.number):
# Chi-squared test
contingency = pd.crosstab(df[col], target)
stat, p, _, _ = chi2_contingency(contingency)
results[col] = {
'test': 'chi2',
'stat': stat,
'p_value': p
}
return pd.DataFrame(results).T.sort_values('p_value')
# Usage:
# df = pd.read_csv('your_data.csv')
df.met_target = df.met_target.astype(bool)
analyze_feature_influence(df, target_col='met_target')
Out[20]:
test | stat | p_value | |
---|---|---|---|
age_group | chi2 | 36.113648 | 0.0 |
ethnicity | chi2 | 46.450561 | 0.000428 |
end | chi2 | 2164.655553 | 0.254419 |
gender | chi2 | 2.247197 | 0.325108 |
start | chi2 | 2401.026199 | 0.382451 |
whos_involved | chi2 | 4353.107405 | 0.411837 |
title | chi2 | 4391.565606 | 0.469235 |
pitch | chi2 | 4402.788284 | 0.489563 |
text | chi2 | 4411.0 | 0.492921 |
hero | chi2 | 4411.0 | 0.492921 |
uri | chi2 | 4411.0 | 0.492921 |
actionUri | chi2 | 4411.0 | 0.492921 |
notes | chi2 | 2968.234961 | 0.607771 |
Charity | chi2 | 0.170503 | 0.679666 |
action | chi2 | 0.0 | 1.0 |
timeLeft | chi2 | 0.0 | 1.0 |
moderated | chi2 | 0.0 | 1.0 |
time: 644 ms (started: 2025-06-27 20:34:24 +12:00)
In [21]:
df.smiling = df.smiling.astype(bool)
time: 729 μs (started: 2025-06-27 20:34:25 +12:00)
In [22]:
formula = "met_target ~ sentiment + smiling + gender + truth + urgency + parsed_age"
logit = smf.glm(formula=formula, data=df, family=sm.families.Binomial(link=sm.families.links.Logit())).fit()
display(logit.summary())
display(f"AIC (Logit): {logit.aic:.2f}")
probit = smf.glm(formula=formula, data=df, family=sm.families.Binomial(link=sm.families.links.Probit())).fit()
display(probit.summary())
display(f"AIC (Probit): {probit.aic:.2f}")
Dep. Variable: | ['met_target[False]', 'met_target[True]'] | No. Observations: | 3144 |
---|---|---|---|
Model: | GLM | Df Residuals: | 3136 |
Model Family: | Binomial | Df Model: | 7 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1215.6 |
Date: | Fri, 27 Jun 2025 | Deviance: | 2431.2 |
Time: | 20:34:25 | Pearson chi2: | 3.15e+03 |
No. Iterations: | 5 | Pseudo R-squ. (CS): | 0.03019 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 5.7990 | 1.485 | 3.906 | 0.000 | 2.889 | 8.709 |
smiling[T.True] | -0.3097 | 0.115 | -2.694 | 0.007 | -0.535 | -0.084 |
gender[T.male] | -0.1971 | 0.110 | -1.794 | 0.073 | -0.412 | 0.018 |
gender[T.other] | 0.7005 | 0.315 | 2.227 | 0.026 | 0.084 | 1.317 |
sentiment | -0.0203 | 0.005 | -4.357 | 0.000 | -0.029 | -0.011 |
truth | -0.0359 | 0.018 | -2.010 | 0.044 | -0.071 | -0.001 |
urgency | 0.0060 | 0.007 | 0.910 | 0.363 | -0.007 | 0.019 |
parsed_age | 0.0139 | 0.003 | 4.522 | 0.000 | 0.008 | 0.020 |
'AIC (Logit): 2447.23'
Dep. Variable: | ['met_target[False]', 'met_target[True]'] | No. Observations: | 3144 |
---|---|---|---|
Model: | GLM | Df Residuals: | 3136 |
Model Family: | Binomial | Df Model: | 7 |
Link Function: | Probit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1215.8 |
Date: | Fri, 27 Jun 2025 | Deviance: | 2431.6 |
Time: | 20:34:25 | Pearson chi2: | 3.16e+03 |
No. Iterations: | 6 | Pseudo R-squ. (CS): | 0.03006 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | 3.0633 | 0.771 | 3.971 | 0.000 | 1.551 | 4.575 |
smiling[T.True] | -0.1667 | 0.061 | -2.711 | 0.007 | -0.287 | -0.046 |
gender[T.male] | -0.1131 | 0.060 | -1.890 | 0.059 | -0.230 | 0.004 |
gender[T.other] | 0.3691 | 0.161 | 2.297 | 0.022 | 0.054 | 0.684 |
sentiment | -0.0104 | 0.002 | -4.293 | 0.000 | -0.015 | -0.006 |
truth | -0.0187 | 0.009 | -2.032 | 0.042 | -0.037 | -0.001 |
urgency | 0.0038 | 0.004 | 1.045 | 0.296 | -0.003 | 0.011 |
parsed_age | 0.0078 | 0.002 | 4.687 | 0.000 | 0.005 | 0.011 |
'AIC (Probit): 2447.64'
time: 106 ms (started: 2025-06-27 20:34:25 +12:00)
In [23]:
df.met_target = df.met_target.astype(int)
smf.ols(formula="Target_completion ~ sentiment + smiling + gender + truth + urgency + parsed_age", data=df).fit().summary()
Out[23]:
Dep. Variable: | Target_completion | R-squared: | 0.001 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | -0.001 |
Method: | Least Squares | F-statistic: | 0.5881 |
Date: | Fri, 27 Jun 2025 | Prob (F-statistic): | 0.766 |
Time: | 20:34:25 | Log-Likelihood: | -25776. |
No. Observations: | 3144 | AIC: | 5.157e+04 |
Df Residuals: | 3136 | BIC: | 5.162e+04 |
Df Model: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -136.3356 | 387.947 | -0.351 | 0.725 | -896.991 | 624.320 |
smiling[T.True] | 17.3750 | 33.570 | 0.518 | 0.605 | -48.446 | 83.196 |
gender[T.male] | 53.2272 | 33.259 | 1.600 | 0.110 | -11.985 | 118.440 |
gender[T.other] | -25.6309 | 80.636 | -0.318 | 0.751 | -183.736 | 132.474 |
sentiment | 0.6093 | 1.214 | 0.502 | 0.616 | -1.771 | 2.990 |
truth | 0.3478 | 4.473 | 0.078 | 0.938 | -8.422 | 9.117 |
urgency | 1.4226 | 2.067 | 0.688 | 0.491 | -2.629 | 5.474 |
parsed_age | -0.5585 | 0.901 | -0.620 | 0.536 | -2.325 | 1.209 |
Omnibus: | 9405.879 | Durbin-Watson: | 2.003 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 530902384.962 |
Skew: | 42.477 | Prob(JB): | 0.00 |
Kurtosis: | 2014.336 | Cond. No. | 3.63e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.63e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
time: 46.2 ms (started: 2025-06-27 20:34:25 +12:00)
In [24]:
smf.ols(formula=formula, data=df).fit().summary()
Out[24]:
Dep. Variable: | met_target | R-squared: | 0.029 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.026 |
Method: | Least Squares | F-statistic: | 13.19 |
Date: | Fri, 27 Jun 2025 | Prob (F-statistic): | 7.41e-17 |
Time: | 20:34:25 | Log-Likelihood: | -1072.2 |
No. Observations: | 3144 | AIC: | 2160. |
Df Residuals: | 3136 | BIC: | 2209. |
Df Model: | 7 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
Intercept | -0.2041 | 0.150 | -1.360 | 0.174 | -0.498 | 0.090 |
smiling[T.True] | 0.0358 | 0.013 | 2.760 | 0.006 | 0.010 | 0.061 |
gender[T.male] | 0.0241 | 0.013 | 1.872 | 0.061 | -0.001 | 0.049 |
gender[T.other] | -0.0751 | 0.031 | -2.409 | 0.016 | -0.136 | -0.014 |
sentiment | 0.0020 | 0.000 | 4.322 | 0.000 | 0.001 | 0.003 |
truth | 0.0033 | 0.002 | 1.919 | 0.055 | -7.21e-05 | 0.007 |
urgency | -0.0008 | 0.001 | -1.001 | 0.317 | -0.002 | 0.001 |
parsed_age | -0.0016 | 0.000 | -4.564 | 0.000 | -0.002 | -0.001 |
Omnibus: | 1102.280 | Durbin-Watson: | 2.006 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 2752.352 |
Skew: | 2.002 | Prob(JB): | 0.00 |
Kurtosis: | 5.230 | Cond. No. | 3.63e+03 |
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.63e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
time: 45.8 ms (started: 2025-06-27 20:34:25 +12:00)
In [25]:
model = RandomForestRegressor()
cols = "sentiment + smiling + gender + truth + urgency + ethnicity".split(" + ")
X = pd.get_dummies(df[cols])
y = df["met_target"]
model.fit(X, y)
Out[25]:
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
n_estimators | 100 | |
criterion | 'squared_error' | |
max_depth | None | |
min_samples_split | 2 | |
min_samples_leaf | 1 | |
min_weight_fraction_leaf | 0.0 | |
max_features | 1.0 | |
max_leaf_nodes | None | |
min_impurity_decrease | 0.0 | |
bootstrap | True | |
oob_score | False | |
n_jobs | None | |
random_state | None | |
verbose | 0 | |
warm_start | False | |
ccp_alpha | 0.0 | |
max_samples | None | |
monotonic_cst | None |
time: 512 ms (started: 2025-06-27 20:34:25 +12:00)
In [26]:
list(zip(cols, model.feature_importances_))
Out[26]:
[('sentiment', np.float64(0.21365808843272313)), ('smiling', np.float64(0.10368107301614912)), ('gender', np.float64(0.10246242527274708)), ('truth', np.float64(0.2056862734817596)), ('urgency', np.float64(0.0559171758768012)), ('ethnicity', np.float64(0.05393489406535803))]
time: 11.9 ms (started: 2025-06-27 20:34:25 +12:00)