import numpy as np
import pandas as pd

import pymc as pm
import xarray as xr

from pymc_marketing.clv import utils
from pymc_marketing.clv import ParetoNBDModel
from pymc_extras.prior import Prior

from pymc_marketing.clv import utils, plotting
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
import pytensor

#set flag to fix open issue
pytensor.config.cxx = '/usr/bin/clang++'

Cree un conjunto de datos simple para pruebas:

d = [
    [1, "2015-01-01", 1],
    [1, "2015-02-06", 2],
    [2, "2015-01-01", 2],
    [3, "2015-01-01", 3],
    [3, "2015-01-02", 1],
    [3, "2015-01-05", 5],
    [4, "2015-01-16", 6],
    [4, "2015-02-02", 3],
    [4, "2015-02-05", 3],
    [4, "2015-02-05", 2],
    [5, "2015-01-16", 3],
    [5, "2015-01-17", 1],
    [5, "2015-01-18", 8],
    [6, "2015-02-02", 5],
]
test_data = pd.DataFrame(d, columns=["id", "date", "monetary_value"])

Nota: el cliente 4 realizó dos compras el 2015-02-05.

_find_first_transactions marca la primera compra que ha realizado cada cliente, la cual debe ser excluida para el modelado. Se llama internamente por rfm_summary.

utils._find_first_transactions(
    transactions=test_data, 
    customer_id_col = "id", 
    datetime_col = "date",
    #monetary_value_col = "monetary_value", 
    #datetime_format = "%Y%m%d",
).reindex()
id date first
0 1 2015-01-01 True
1 1 2015-02-06 False
2 2 2015-01-01 True
3 3 2015-01-01 True
4 3 2015-01-02 False
5 3 2015-01-05 False
6 4 2015-01-16 True
7 4 2015-02-02 False
8 4 2015-02-05 False
10 5 2015-01-16 True
11 5 2015-01-17 False
12 5 2015-01-18 False
13 6 2015-02-02 True

Observe cómo el 9 falta en el índice del dataframe. Múltiples transacciones en el mismo período de tiempo se tratan como una única compra, por lo que los índices de esas transacciones adicionales se omiten.

rfm_summary es el paso principal de preprocesamiento de datos para el modelado de CLV en el dominio continuo y no contractual:

rfm_df = utils.rfm_summary(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    monetary_value_col = "monetary_value",
    observation_period_end = "2015-02-06",
    datetime_format = "%Y-%m-%d",
    time_unit = "W",
    include_first_transaction=True,
)

rfm_df.head()
customer_id frequency recency monetary_value
0 1 2.0 0.0 1.5
1 2 1.0 5.0 2.0
2 3 2.0 4.0 4.5
3 4 2.0 0.0 7.0
4 5 1.0 3.0 12.0

Para los ajustes de MAP y modelos de covariables, rfm_train_test_split se puede utilizar para evaluar modelos en datos no vistos. También es útil para identificar el impacto de un evento basado en el tiempo, como una campaña de marketing.

train_test = utils.rfm_train_test_split(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    train_period_end = "2015-02-01",
    monetary_value_col = "monetary_value",
)

train_test.head()
customer_id frequency recency T monetary_value test_frequency test_monetary_value test_T
0 1 0.0 0.0 31.0 0.0 1.0 2.0 5.0
1 2 0.0 0.0 31.0 0.0 0.0 0.0 5.0
2 3 2.0 4.0 31.0 3.0 0.0 0.0 5.0
3 4 0.0 0.0 16.0 0.0 2.0 4.0 5.0
4 5 2.0 2.0 16.0 4.5 0.0 0.0 5.0

rfm_segments asignará a los clientes a segmentos basados en su recencia, frecuencia y valor monetario. Utiliza un enfoque de puntuación RFM basado en cuartiles que es muy eficiente en términos computacionales, pero definir segmentos personalizados es un ejercicio bastante subjetivo. El dataframe devuelto tampoco puede ser utilizado para modelar porque no anula las transacciones iniciales.

segments = utils.rfm_segments(
    test_data, 
    customer_id_col = "id", 
    datetime_col = "date", 
    monetary_value_col = "monetary_value",
    observation_period_end = "2015-02-06",
    datetime_format = "%Y-%m-%d",
    time_unit = "W",
)
/Users/coltallen/Projects/pymc-marketing/pymc_marketing/clv/utils.py:702: UserWarning: RFM score will not exceed 2 for f_quartile. Specify a custom segment_config
  warnings.warn(

Trazado#

expected_cumulative_transactions y todas las demás funciones de trazado requieren un modelo ajustado. Pruebe con tanto MAP como con los posteriores completos:

url_cdnow = "https://raw.githubusercontent.com/pymc-labs/pymc-marketing/main/data/cdnow_transactions.csv"
raw_trans = pd.read_csv(url_cdnow)

rfm_data = utils.rfm_summary(
    raw_trans, 
    customer_id_col = "id", 
    datetime_col = "date", 
    datetime_format = "%Y%m%d",
    time_unit = "W",
    observation_period_end = "19970930",
    #time_scaler = 7,
)

rfm_train_test = utils.rfm_train_test_split(
    raw_trans, 
    customer_id_col = "id", 
    datetime_col = "date", 
    train_period_end = "19970701",
    datetime_format = "%Y%m%d",
    time_unit = "W",
    test_period_end = "19970930",
)

t=rfm_data["T"].max().astype(int)
t_start_eval = rfm_train_test["T"].max()

pnbd_split = ParetoNBDModel(data=rfm_data)
pnbd_split.fit()

arviz.InferenceData
    • <xarray.Dataset> Size: 48B
      Dimensions:  (chain: 1, draw: 1)
      Coordinates:
        * chain    (chain) int64 8B 0
        * draw     (draw) int64 8B 0
      Data variables:
          alpha    (chain, draw) float64 8B 14.46
          beta     (chain, draw) float64 8B 10.48
          r        (chain, draw) float64 8B 0.6338
          s        (chain, draw) float64 8B 0.4882
      Attributes:
          created_at:                 2025-09-17T17:06:17.639985+00:00
          arviz_version:              0.22.0
          inference_library:          pymc
          inference_library_version:  5.25.1

    • <xarray.Dataset> Size: 57kB
      Dimensions:            (customer_id: 2357, obs_var: 2)
      Coordinates:
        * customer_id        (customer_id) int64 19kB 1 2 3 4 ... 2354 2355 2356 2357
        * obs_var            (obs_var) <U9 72B 'recency' 'frequency'
      Data variables:
          recency_frequency  (customer_id, obs_var) float64 38kB 30.0 2.0 ... 0.0 0.0
      Attributes:
          created_at:                 2025-09-17T17:06:17.644276+00:00
          arviz_version:              0.22.0
          inference_library:          pymc
          inference_library_version:  5.25.1

    • <xarray.Dataset> Size: 94kB
      Dimensions:      (index: 2357)
      Coordinates:
        * index        (index) int64 19kB 0 1 2 3 4 5 ... 2352 2353 2354 2355 2356
      Data variables:
          customer_id  (index) int64 19kB 1 2 3 4 5 6 ... 2353 2354 2355 2356 2357
          frequency    (index) float64 19kB 2.0 1.0 0.0 0.0 0.0 ... 5.0 0.0 4.0 0.0
          recency      (index) float64 19kB 30.0 2.0 0.0 0.0 0.0 ... 24.0 0.0 26.0 0.0
          T            (index) float64 19kB 39.0 39.0 39.0 39.0 ... 27.0 27.0 27.0

Compras Acumulativas Esperadas#

df_cum = utils._expected_cumulative_transactions(
    model=pnbd_split,
    transactions=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    t=t,
    datetime_format="%Y%m%d",
    time_unit="W",
    set_index_date=True,
)

df_cum.head()
actual predicted
1996-12-30/1997-01-05 0 4.286469
1997-01-06/1997-01-12 3 15.595394
1997-01-13/1997-01-19 17 33.947389
1997-01-20/1997-01-26 44 59.853467
1997-01-27/1997-02-02 67 93.519782

Si se realiza una división entre entrenamiento/prueba, o se estudian intervenciones temporales como campañas de marketing, plot_expected_purchases_over_time se puede utilizar para una interpretación visual fácil.

plotting.plot_expected_purchases_over_time(
    model=pnbd_split,
    purchase_history=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    t=t,
    t_start_eval = t_start_eval,
    datetime_format="%Y%m%d",
    time_unit="W",
    plot_cumulative=True,
)
<Axes: title={'center': 'Tracking Cumulative Transactions'}, xlabel='Time Periods', ylabel='Purchases'>
../../../_images/5cbfe1e1f75f3e414393a0c99d6c02c58d921b67243fe0ef74414a967d448d57.png
plotting.plot_expected_purchases_ppc(pnbd_split, ppc='prior');
Sampling: [alpha, beta, r, recency_frequency, s]
../../../_images/91c5b54ec6d1f142f1de2e1cd16020d169612e242141b652a981f064ce8d93e4.png
from matplotlib import pyplot as plt

fig = plt.figure()

fig.add_axes(plotting.plot_expected_purchases_ppc(pnbd_split, ppc='posterior'))

fig.savefig("save_test.png");
Sampling: [recency_frequency]

../../../_images/1f8e85e558494a0aeb207d79d40b4778fac4fb9412bc8efbed3abeb0057acd6d.png

Ahora ajuste un modelo muestreando de las distribuciones posteriores:

pnbd = ParetoNBDModel(data=rfm_data)

pnbd.build_model()
with pnbd.model:
    pnbd.idata = pm.sample(
        step=pm.DEMetropolisZ(),
        tune=2500,
        draws=3000,
        idata_kwargs={"log_likelihood": True},
    )
pnbd.fit_summary()
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
alpha 14.596 1.114 12.445 16.640 0.042 0.030 712.0 1280.0 1.01
beta 12.043 3.800 5.029 18.742 0.127 0.090 862.0 1294.0 1.00
r 0.640 0.054 0.545 0.742 0.002 0.001 894.0 1132.0 1.01
s 0.529 0.105 0.340 0.729 0.004 0.003 842.0 1553.0 1.01
# thin_fit_result is not supported with external sampler fits in `ParetoNBDModel`!
#pnbd.idata = pnbd.thin_fit_result(keep_every=3)

pnbd.idata = pnbd.idata.isel(draw=slice(None, None, 3)).copy()
pnbd.idata
arviz.InferenceData
    • <xarray.Dataset> Size: 136kB
      Dimensions:  (chain: 4, draw: 1000)
      Coordinates:
        * chain    (chain) int64 32B 0 1 2 3
        * draw     (draw) int64 8kB 0 3 6 9 12 15 18 ... 2982 2985 2988 2991 2994 2997
      Data variables:
          alpha    (chain, draw) float64 32kB 14.46 12.74 12.74 ... 13.8 13.8 13.8
          beta     (chain, draw) float64 32kB 10.87 14.3 14.3 ... 13.58 13.58 13.58
          r        (chain, draw) float64 32kB 0.6681 0.56 0.56 ... 0.5876 0.5876
          s        (chain, draw) float64 32kB 0.5164 0.5783 0.5783 ... 0.5571 0.5571
      Attributes:
          created_at:                 2024-11-23T22:39:12.899029+00:00
          arviz_version:              0.18.0
          inference_library:          pymc
          inference_library_version:  5.15.1
          sampling_time:              6.328435897827148
          tuning_steps:               2500

    • <xarray.Dataset> Size: 75MB
      Dimensions:            (chain: 4, draw: 1000, customer_id: 2357)
      Coordinates:
        * chain              (chain) int64 32B 0 1 2 3
        * draw               (draw) int64 8kB 0 3 6 9 12 ... 2985 2988 2991 2994 2997
        * customer_id        (customer_id) int64 19kB 1 2 3 4 ... 2354 2355 2356 2357
      Data variables:
          recency_frequency  (chain, draw, customer_id) float64 75MB -9.378 ... -0....
      Attributes:
          created_at:                 2024-11-23T22:39:18.592759+00:00
          arviz_version:              0.18.0
          inference_library:          pymc
          inference_library_version:  5.15.1

    • <xarray.Dataset> Size: 108kB
      Dimensions:   (chain: 4, draw: 1000)
      Coordinates:
        * chain     (chain) int64 32B 0 1 2 3
        * draw      (draw) int64 8kB 0 3 6 9 12 15 ... 2982 2985 2988 2991 2994 2997
      Data variables:
          accept    (chain, draw) float64 32kB 8.341 0.01704 ... 0.03161 0.627
          accepted  (chain, draw) bool 4kB True False False ... False False False
          lambda    (chain, draw) float64 32kB 0.8415 0.8415 0.8415 ... 0.8415 0.8415
          scaling   (chain, draw) float64 32kB 0.0002288 0.0002288 ... 0.001 0.001
      Attributes:
          created_at:                 2024-11-23T22:39:12.901456+00:00
          arviz_version:              0.18.0
          inference_library:          pymc
          inference_library_version:  5.15.1
          sampling_time:              6.328435897827148
          tuning_steps:               2500

    • <xarray.Dataset> Size: 57kB
      Dimensions:            (customer_id: 2357, obs_var: 2)
      Coordinates:
        * customer_id        (customer_id) int64 19kB 1 2 3 4 ... 2354 2355 2356 2357
        * obs_var            (obs_var) <U9 72B 'recency' 'frequency'
      Data variables:
          recency_frequency  (customer_id, obs_var) float64 38kB 30.0 2.0 ... 0.0 0.0
      Attributes:
          created_at:                 2024-11-23T22:39:12.902959+00:00
          arviz_version:              0.18.0
          inference_library:          pymc
          inference_library_version:  5.15.1

plotting.plot_expected_purchases(
    model=pnbd,
    purchase_history=raw_trans,
    customer_id_col="id",
    datetime_col="date",
    t=25*7,
    datetime_format="%Y%m%d",
    time_unit="W",
    plot_cumulative=True,
)
<Axes: title={'center': 'Tracking Cumulative Transactions'}, xlabel='Time Periods', ylabel='Purchases'>
../../../_images/06e4d5697384340b988a3a7cea1216f027c1e7f1b7997d00424a5da8cdd32952.png
plotting.plot_expected_purchases_ppc(pnbd, ppc='prior');
Sampling: [alpha, beta, r, recency_frequency, s]
../../../_images/9dbf46afd314ccbff8535641edd541828113ff5e9e1eaca9fd214c46fb58e22c.png
plotting.plot_expected_purchases_ppc(pnbd, ppc='posterior');
Sampling: [recency_frequency]


../../../_images/1e16a6cb79b19be414c2f08eebdf2ac4cefdaf0f0216dbed23afde4a262fbf92.png

Agregar HDIs a plot_purchase_pmf requiere que los recuentos de valores se agrupen coordenada por coordenada a lo largo de una dimensión como draws, lo cual puede no ser compatible con xarray. Luego, se puede aplicar arviz.hdi a todos los recuentos de valores para la estimación de intervalos.

# keep n_samples=1
pnbd.distribution_new_customer_recency_frequency(
        random_seed=45,
        n_samples=1,
    ).sel(obs_var="frequency")
Sampling: [recency_frequency]


<xarray.DataArray 'recency_frequency' (chain: 4, draw: 1000, customer_id: 2357)> Size: 75MB
array([[[ 2.,  0.,  0., ...,  7.,  1.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  2.,  0., ...,  0.,  4.,  0.],
        ...,
        [ 8.,  2., 10., ...,  7.,  0.,  1.],
        [ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  2.,  6.,  0.]],

       [[ 1.,  3.,  0., ...,  1.,  2.,  0.],
        [ 0.,  0.,  0., ...,  0.,  1.,  0.],
        [ 0.,  3.,  0., ...,  0.,  2.,  4.],
        ...,
        [ 2.,  0.,  0., ...,  0.,  2.,  0.],
        [ 0.,  0.,  4., ...,  0.,  1.,  3.],
        [ 0.,  0.,  0., ...,  0.,  0.,  2.]],

       [[ 1.,  0.,  1., ...,  3.,  0.,  0.],
        [ 1.,  1.,  0., ...,  3.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  2.,  0.],
        ...,
        [ 0.,  0.,  0., ...,  0.,  6.,  0.],
        [ 1.,  0.,  0., ...,  0.,  1.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  1.]],

       [[ 1.,  0.,  0., ...,  0.,  0.,  1.],
        [ 3.,  0.,  0., ...,  2.,  0.,  0.],
        [ 4.,  1.,  0., ...,  2.,  5.,  0.],
        ...,
        [ 1.,  0.,  3., ...,  2.,  3.,  0.],
        [ 0.,  0.,  1., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  2.,  0.,  0.]]])
Coordinates:
  * chain        (chain) int64 32B 0 1 2 3
  * draw         (draw) int64 8kB 0 3 6 9 12 15 ... 2985 2988 2991 2994 2997
  * customer_id  (customer_id) int64 19kB 1 2 3 4 5 ... 2353 2354 2355 2356 2357
    obs_var      <U9 36B 'frequency'
import pymc as pm
import xarray as xr

# TODO: Add build_model() before a prior predictive check
pnbd.build_model()
with pnbd.model:
    prior_idata = pm.sample_prior_predictive(random_seed=45, draws=100)

# obs_var must be obtained from  prior_idata in case of an unfit model
obs_freq = prior_idata.observed_data["recency_frequency"].sel(obs_var="frequency")
ppc_freq = prior_idata.prior_predictive["recency_frequency"].sel(obs_var="frequency")#.mean(("chain","draw"))
#ppc_freq = prior_idata.prior_predictive["recency_frequency"].sel(obs_var="frequency").mean(("chain","draw"))#.rename({o

ppc_df = ppc_freq.to_dataframe()['recency_frequency'].value_counts(normalize=True).sort_index() * 100

# Percentages are the only way to compare actual counts to chain*draw simulated counts
# merged_xr.to_dataframe()["recency_frequency"].value_counts(normalize=True) * 100
# merged_xr.to_dataframe()["ppc_mean"].value_counts(normalize=True) * 100
Sampling: [alpha, beta, r, recency_frequency, s]
ppc_df.reset_index()['proportion'].head(5).plot(kind='bar')
<Axes: >
../../../_images/b4132e423558b6e356b3fce93fc895136df32d25ad4f6de1c21519b4f6eec8ce.png
from xarray.groupers import UniqueGrouper

# This will do value counts for entire DataArray
#ppc_freq.groupby(ppc_freq).count()
#ppc_freq.groupby(chain=UniqueGrouper()).count()

#value_counts = xr.DataArray(np.bincount(ppc_freq.values), dims="value")
#np.bincount(ppc_freq.values)
xr.DataArray([1, 2, 2, 3, 3, 3], dims="x")
# but what about individual customers?
# create a value count coordinate
# prior_idata.prior_predictive.coords["value_counts"] = ppc_freq.groupby(ppc_freq).count()["ppc_mean"].values
# ppc_freq = prior_idata.rename_vars({"recency_frequency":"ppc_mean"}).prior_predictive["ppc_mean"]#.sel(obs_var="frequency")#.mean(("chain","draw"))
# ppc_freq
# prior_idata.prior_predictive.groupby(value_counts=UniqueGrouper()).count().sel(obs_var="frequency")
#prior_idata.prior_predictive.groupby("customer_id").count()
#prior_idata# 
#
# #ppc_freq.coords["grouped_customer"] 
#ppc_freq.groupby(draw=UniqueGrouper()).count()
<xarray.DataArray (x: 6)> Size: 48B
array([1, 2, 2, 3, 3, 3])
Dimensions without coordinates: x
from arviz.stats import hdi

calc_hdi = hdi(
    ary=prior_idata,
    hdi_prob=.9, #param
    group='prior_predictive', #posterior
).sel(obs_var='frequency')

hdi_lo = calc_hdi.sel(hdi='lower').to_array().squeeze()
hdi_hi = calc_hdi.sel(hdi='higher').to_array().squeeze()

# TODO: Join back to ppc_freq & obs var
hdi_hi
<xarray.DataArray (customer_id: 2357)> Size: 19kB
array([6., 5., 9., ..., 6., 6., 6.])
Coordinates:
  * customer_id  (customer_id) int64 19kB 1 2 3 4 5 ... 2353 2354 2355 2356 2357
    obs_var      <U9 36B 'frequency'
    hdi          <U6 24B 'higher'
    variable     <U17 68B 'recency_frequency'