Source code for pscore_match.match

"""
This module implements several variants of matching: 
one-to-one matching, one-to-many matching, with or without a caliper, 
and without or without replacement. 
Variants of the methods are examined in Austin (2014).


Austin, P. C. (2014), A comparison of 12 algorithms for matching on the 
propensity score. Statistic. Med., 33: 1057-1069.
"""

from __future__ import division
import numpy as np
import scipy
from scipy.stats import binom, hypergeom, gaussian_kde, ttest_ind, ranksums
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go

################################################################################
################################# utils ########################################
################################################################################

def set_caliper(caliper_scale, caliper, propensity):
    # Check inputs
    if caliper_scale == None:
        caliper = 0
    if not(0<=caliper<1):
        if caliper_scale == "propensity" and caliper>1:
            raise ValueError('Caliper for "propensity" method must be between 0 and 1')
        elif caliper<0:
            raise ValueError('Caliper cannot be negative')

    # Transform the propensity scores and caliper when caliper_scale is "logit" or None
    if caliper_scale == "logit":
        propensity = np.log(propensity/(1-propensity))
        caliper = caliper*np.std(propensity)
    return caliper
    
    
def recode_groups(groups, propensity):
    # Code groups as 0 and 1
    groups = (groups == groups.unique()[0])
    N = len(groups)
    N1 = groups[groups == 1].index
    N2 = groups[groups == 0].index
    g1 = propensity[groups == 1]
    g2 = propensity[groups == 0]
    # Check if treatment groups got flipped - the smaller should correspond to N1/g1
    if len(N1) > len(N2):
       N1, N2, g1, g2 = N2, N1, g2, g1
    return groups, N1, N2, g1, g2

################################################################################
############################# Base Matching Class ##############################
################################################################################

[docs]class Match(object): """ Parameters ---------- groups : array-like treatment assignments, must be 2 groups propensity : array-like object containing propensity scores for each observation. Propensity and groups should be in the same order (matching indices) """ def __init__(self, groups, propensity): self.groups = pd.Series(groups) self.propensity = pd.Series(propensity) assert self.groups.shape==self.propensity.shape, "Input dimensions dont match" assert all(self.propensity >=0) and all(self.propensity <=1), "Propensity scores must be between 0 and 1" assert len(np.unique(self.groups)==2), "Wrong number of groups" self.nobs = self.groups.shape[0] self.ntreat = np.sum(self.groups == 1) self.ncontrol = np.sum(self.groups == 0)
[docs] def create(self, method='one-to-one', **kwargs): """ Parameters ---------- method : string 'one-to-one' (default) or 'many-to-one' caliper_scale: string "propensity" (default) if caliper is a maximum difference in propensity scores, "logit" if caliper is a maximum SD of logit propensity, or "none" for no caliper caliper : float specifies maximum distance (difference in propensity scores or SD of logit propensity) replace : bool should individuals from the larger group be allowed to match multiple individuals in the smaller group? (default is False) Returns ------- A series containing the individuals in the control group matched to the treatment group. Note that with caliper matching, not every treated individual may have a match. """ if method=='many-to-one': self._match_many(**kwargs) self._match_info() elif method=='one-to-one': self._match_one(**kwargs) self._match_info() else: raise ValueError('Invalid matching method')
def _match_one(self, caliper_scale=None, caliper=0.05, replace=False): """ Implements greedy one-to-one matching on propensity scores. Parameters ---------- caliper_scale: string "propensity" (default) if caliper is a maximum difference in propensity scores, "logit" if caliper is a maximum SD of logit propensity, or "none" for no caliper caliper : float specifies maximum distance (difference in propensity scores or SD of logit propensity) replace : bool should individuals from the larger group be allowed to match multiple individuals in the smaller group? (default is False) """ caliper = set_caliper(caliper_scale, caliper, self.propensity) groups, N1, N2, g1, g2 = recode_groups(self.groups, self.propensity) # Randomly permute the smaller group to get order for matching morder = np.random.permutation(N1) matches = {} for m in morder: dist = abs(g1[m] - g2) if (dist.min() <= caliper) or not caliper: matches[m] = dist.argmin() # Potential problem: check for ties if not replace: g2 = g2.drop(matches[m]) self.matches = matches self.weights = np.zeros(self.nobs) self.freq = np.zeros(self.nobs) mk = list(matches.keys()) mv = list(matches.values()) for i in range(len(matches)): self.freq[mk[i]] += 1 self.weights[mk[i]] += 1 self.freq[mv[i]] += 1 self.weights[mv[i]] += 1 def _match_many(self, many_method="knn", k=1, caliper=0.05, caliper_scale="propensity", replace=True): """ Implements greedy one-to-many matching on propensity scores. Parameters ---------- many_method : string "caliper" (default) to select all matches within a given range, "knn" for k nearest neighbors, k : int (default is 1). If method is "knn", this specifies the k in k nearest neighbors caliper : float specifies maximum distance (difference in propensity scores or SD of logit propensity) caliper_scale: string "propensity" (default) if caliper is a maximum difference in propensity scores, "logit" if caliper is a maximum SD of logit propensity, or "none" for no caliper replace : bool should individuals from the larger group be allowed to match multiple individuals in the smaller group? (default is False) """ if many_method=="caliper": assert caliper_scale is not None, "Choose a caliper" caliper = set_caliper(caliper_scale, caliper, self.propensity) groups, N1, N2, g1, g2 = recode_groups(self.groups, self.propensity) # Randomly permute the smaller group to get order for matching morder = np.random.permutation(N1) matches = {} for m in morder: dist = abs(g1[m] - g2) dist.sort_values(inplace=True) if many_method == "knn": matches[m] = np.array(dist.nsmallest(n=k).index) # PROBLEM: when there are ties in the knn. # Need to randomly select among the observations tied for the farthest acceptable distance else: keep = np.array(dist[dist<=caliper].index) if len(keep): matches[m] = keep else: matches[m] = np.array([dist.argmin()]) if not replace: g2 = g2.drop(matches[m]) self.matches = matches self.weights = np.zeros(self.nobs) self.freq = np.zeros(self.nobs) mk = list(matches.keys()) mv = list(matches.values()) for i in range(len(matches)): self.freq[mk[i]] += 1 self.weights[mk[i]] += 1 self.freq[mv[i]] += 1 self.weights[mv[i]] += 1/len(mv[i]) def _match_info(self): """ Helper function to create match info """ assert self.matches is not None, 'No matches yet!' self.matches = { 'match_pairs' : self.matches, 'treated' : np.unique(list(self.matches.keys())), 'control' : np.unique(list(self.matches.values())) } self.matches['dropped'] = np.setdiff1d(list(range(self.nobs)), np.append(self.matches['treated'], self.matches['control']))
[docs] def plot_balance(self, covariates, test=['t', 'rank'], filename='balance-plot', **kwargs): """ Plot the p-values for covariate balance before and after matching Parameters ---------- matches : Match Match class object with matches already fit covariates : DataFrame Dataframe for with all observations and one covariate per column. test : array-like or str Statistical test to compare treatment and control covariate distributions. Options are 't' for a two sample t-test or 'rank' for Wilcoxon rank sum test filename : str Optional, name of file to save plot in. Default 'balance-plot' kwargs : dict Key word arguments to pass into plotly.offline.plot Returns ------- None Notes ----- Creates a file with given filename """ # check valid test inputs test = list(test) extra = set(test) - set(['t', 'rank']) if len(extra) > 0: raise ValueError('unidentified test was supplied') # use matches to create covariate and tr dataframes covnames = list(covariates.columns) matched_c = whichMatched(self, covariates, show_duplicates=True)[covnames] matched_g = whichMatched(self, self.groups, show_duplicates=True) trace0_t = None trace1_t = None trace0_rank = None trace1_rank = None # run tests, get pvalues if 't' in test: pvalues_before_t = t_test(covariates, self.groups) pvalues_after_t = t_test(matched_c, matched_g) trace0_t = go.Scatter( x=pvalues_before_t, y=covnames, mode='markers', name='t-test p-values before matching', marker=dict( color='blue', size=12, symbol='circle', ) ) trace1_t = go.Scatter( x=pvalues_after_t, y=covnames, mode='markers', name='t-test p-values after matching', marker=dict( color='pink', size=12, symbol='circle', ) ) if 'rank' in test: pvalues_before_rank = rank_test(covariates, self.groups) pvalues_after_rank = rank_test(matched_c, matched_g) trace0_rank = go.Scatter( x=pvalues_before_rank, y=covnames, mode='markers', name='Wilcoxon test p-values before matching', marker=dict( color='blue', size=12, symbol='triangle-up', ) ) trace1_rank = go.Scatter( x=pvalues_after_rank, y=covnames, mode='markers', name='Wilcoxon test p-values after matching', marker=dict( color='pink', size=12, symbol='triangle-up', ) ) # call code to create figure data = [trace0_t, trace1_t, trace0_rank, trace1_rank] layout = go.Layout( title='Balance test p-values, before and after matching', xaxis=dict( showgrid=False, showline=True, linecolor='gray', titlefont=dict( color='gray' ), tickfont=dict( color='gray' ), tickmode='array', tickvals=[0, 0.05, 0.1, 0.5, 1], ticktext=[0, 0.05, 0.1, 0.5, 1], ticks='outside', tickcolor='gray', ), margin=dict( l=140, r=40, b=80, t=80 ), legend=dict( font=dict( size=10 ), orientation='h' ), shapes=[dict( type='line', x0=0.05, x1=0.05, y0=-1, y1=len(covnames), line=dict( color='gray', dash='dot' ), )], width=800, height=600, hovermode='closest' ) fig = go.Figure(data=data, layout=layout) plotly.offline.plot(fig, filename=filename, show_link=False, **kwargs) ################################################################################ ############################ helper funcs ##################################### ################################################################################
[docs]def whichMatched(matches, data, show_duplicates = True): """ Simple function to convert output of Matches to DataFrame of all matched observations Parameters ---------- matches : Match Match class object with matches already fit data : DataFrame Dataframe with unique rows, for which we want to create new matched data. This may be a dataframe of covariates, treatment, outcome, or any combination. show_duplicates : bool Should repeated matches be included as multiple rows? Default is True. If False, then duplicates appear as one row but a column of weights is added. Returns ------- DataFrame containing only the treatment group and matched controls, with the same columns as input data """ if show_duplicates: indices = [] for i in range(len(matches.freq)): j = matches.freq[i] while j>0: indices.append(i) j -= 1 return data.ix[indices] else: dat2 = data.copy() dat2['weights'] = matches.weights dat2['frequency'] = matches.freq keep = dat2['frequency'] > 0 return dat2.loc[keep]
[docs]def rank_test(covariates, groups): """ Wilcoxon rank sum test for the distribution of treatment and control covariates. Parameters ---------- covariates : DataFrame Dataframe with one covariate per column. If matches are with replacement, then duplicates should be included as additional rows. groups : array-like treatment assignments, must be 2 groups Returns ------- A list of p-values, one for each column in covariates """ colnames = list(covariates.columns) J = len(colnames) pvalues = np.zeros(J) for j in range(J): var = covariates[colnames[j]] res = ranksums(var[groups == 1], var[groups == 0]) pvalues[j] = res.pvalue return pvalues
[docs]def t_test(covariates, groups): """ Two sample t test for the distribution of treatment and control covariates Parameters ---------- covariates : DataFrame Dataframe with one covariate per column. If matches are with replacement, then duplicates should be included as additional rows. groups : array-like treatment assignments, must be 2 groups Returns ------- A list of p-values, one for each column in covariates """ colnames = list(covariates.columns) J = len(colnames) pvalues = np.zeros(J) for j in range(J): var = covariates[colnames[j]] res = ttest_ind(var[groups == 1], var[groups == 0]) pvalues[j] = res.pvalue return pvalues