Stumpy - computes 'Matrix Profile', to enable data mining timeseries

Hello all,
Been a long time.
Code computes the Matrix Profile,
then finds the best motif (pattern) and discord (outlier).
Other matches can be produced.

The motif and discord regions of interest are highlighted as green/red rectangles using gfx.
The reference site(s) has examples and tutorials which work as described.

There is opportunity to use GPU and dask (also distributed) to accelerate computations
Ability to stream data and update Matrix Profile on the fly.

Hope this helps start the investigations, since there are many applications and ideas to try
regards
FE.

example display here
Screenshot 2022-04-29 142115

AFL file below

_N(pdir = "C:\\Proj\\Stumpy\\");	// proj directory
_N(pjn = "Stumpy");				    // project name
_N(pfn = pjn+" (1.0.0)");			// python filename
_N(ctxt = pjn+" (1.0.0)");			// context-this AFL filename
/* DESC
Via chart parameter settings
	- Clean signal / normalise 
	- plot
	- plot timeseries estimate
- pyPlot
*/
/* Notes
- enable in python script (amiArgs,amiRtn),
  allows amiPy print of AFL inputs and return matrix in interpretation window
- display data & results in python
  allows python to save results to file
- full barindex data used
*/

//////////////////////////////////////////////////////////////////////////////
// LOGGING
//PyChangeLoggingOptions("","wsemd");

//////////////////////////////////////////////////////////////////////////////
// PARAMETERS
pSig = ParamTrigger("Determine Matrix Profile","Click");
// pass data --> save to csv/noSave  --> find_profile using df
// use csv   --> read TS csv         --> get_profile using df
// prepare TS--> either read TS, df  --> find_profile using df
// quotes    --> select col(s) as TS --> find_profile using df
pPass = ParamList("Pass data","Yes|Yes_CSV|Use_CSV|Only_Prepare|Quotes",0);
pNbars = Param("Last N Bars Closing Prices",20,10,1000,1);

pWind = Param("Window Length",32,3,2500,15);
_N(startDate = ParamDate("Start Date","1-1-2021",1));
_N(endDate = ParamDate("End Date","28-7-2022",1));

pChart = ParamToggle("Chart result","N|Y",0);
pHat = ParamToggle("Chart Estimate Signal","N|Y",0);

pAsv = ParamTrigger("Clear StaticVars (Proj)","Click");
pSsv = ParamTrigger("Clear StaticVars (Stock)","Click");
pPsv = ParamToggle("Persistent StaticVars","N|Y",0);


//////////////////////////////////////////////////////////////////////////////
// CONSTANTS

//////////////////////////////////////////////////////////////////////////////
// VARIABLES
bi  = BarIndex();
DN  = DateNum();

fvb = FirstVisibleValue(bi);
lvb = LastVisibleValue(bi);
lvx = Status("lastvisiblebar"); // barcount-1+blanks
blank = lvx-lvb;
tot	= lvb-fvb+1;

//////////////////////////////////////////////////////////////////////////////
// TICKER
// remove Norgate suffix
cnt = StrFind(Name(),".au");
if(cnt!=0) { ticker = StrLeft(Name(),cnt-1); }
else { ticker = Name(); }

// INITIALISE
// removes staticvars from project
if(pAsv) {
	StaticVarRemove(pjn+"*");
}
// removes staticvars from stock
if(pSsv) {
	StaticVarRemove(ctxt+"_"+ticker+"*");
}

//////////////////////////////////////////////////////////////////////////////
// INPUTS
field = Close;

//////////////////////////////////////////////////////////////////////////////
// lOAD PYTHON SCRIPT
PyLoadFromFile(ctxt,pdir+pfn+".py");

//////////////////////////////////////////////////////////////////////////////
// arrange the data
// pass data --> save to csv/noSave --> find_profile using df
if(pPass=="Yes" OR pPass=="Yes_CSV") {
    sz_df = PyEvalFunction(ctxt,"Get_TS",DN,field,ticker);
	// optionally, save timeseries as csv
	if(sz_df>0 AND pPass=="Yes_CSV") {
		PyEvalFunction(ctxt,"Save_TS_csv");
	}
}
else if(pPass=="Use_CSV") {
  // use csv --> read TS csv --> get_profile using df
  PyEvalFunction(ctxt,"Get_TS_csv",ticker);
}
else if(pPass=="Only_Prepare") {
  // prepare TS --> either read TS, df --> find_profile using df
  // Prepare Data for date range
  PyEvalFunction(ctxt,"Prepare_TS",startDate,EndDate);
}
else if(pPass=="Quotes") {
  // quotes --> then select col(s) to use as TS --> find_profile using df
  PyEvalFunction(ctxt,"Quotes",O,H,L,C,V);
}

//////////////////////////////////////////////////////////////////////////////
//matrix_profile structure:
//mp-col 0 -> motif values
//  -col 1 -> idx to nearest (either direction)
//  -col 2 -> idx to nearest left side
//  -col 3 -> idx to nearest right side
tmp = Null;
if(pSig) {	// Find/Get the matrix_profile returned as [matrix]?
  Profile = PyEvalFunction(ctxt,"Get_Profile",pWind);
  StaticVarSet(ctxt+"_"+ticker+"_mp",Profile,pPsv);

//if(typeof(Profile)=="matrix") {
  // get the Motif index, Discord index & and their nearest_neighbour index
  motif = PyEvalFunction(ctxt,"Find_Motif",pWind);
  motif_idx = motif[0][0];
  nearest_motif = motif[1][0];
  discord = PyEvalFunction(ctxt,"Find_Discord",pWind);
  discord_idx = discord[0][0];
  nearest_discord = discord[1][0];
  // save the findings
  StaticVarSet(ctxt+"_"+ticker+"_motif_idx",motif_idx,pPsv);
  StaticVarSet(ctxt+"_"+ticker+"_nearest_motif",nearest_motif,pPsv);
  StaticVarSet(ctxt+"_"+ticker+"_discord_idx",discord_idx,pPsv);
  StaticVarSet(ctxt+"_"+ticker+"_nearest_discord",nearest_discord,pPsv);
  // print the findings
  printf("\nThe motif idx is %g",motif_idx);
  printf("\nThe nearest motif idx is %g",nearest_motif);
  printf("\nThe discord idx is %g",discord_idx);
  printf("\nThe nearest discord idx is %g",nearest_discord);
//}
//else { rows = cols = 0; }
}

// get saved matrix profile, use overlay to display motifs & anomolies
mp = StaticVarGet(ctxt+"_"+ticker+"_mp");
rows = MxGetSize(mp,0);;
cols = MxGetSize(mp,1);
motif_idx = StaticVarGet(ctxt+"_"+ticker+"_motif_idx");
nearest_motif = StaticVarGet(ctxt+"_"+ticker+"_nearest_motif");
discord_idx = staticVarGet(ctxt+"_"+ticker+"_discord_idx");
nearest_discord = StaticVarGet(ctxt+"_"+ticker+"_nearest_discord");


//////////////////////////////////////////////////////////////////////////////
// CHARTING
SetChartBkColor(ColorRGB(0,0,0));
SetChartOptions(0,chartShowArrows|chartShowDates);
Plot(C,"C",colorYellow,styleCandle|styleNoTitle,Null,Null,0,0,1);
 

///////////////////////////////////////////////////////////////////////////////
// Graphics parameters
GfxSetOverlayMode(0); 				// low-level graphic is overlaid on top of charts
GfxSetCoordsMode(3);  				// X coordinate is bar index, Y is pixel
GfxSelectPen(colorYellow); 
GfxSelectSolidBrush(colorYellow);

pxchartbottom = Status("pxchartbottom");
pxcharttop = Status("pxcharttop"); 
pxchartheight = Status("pxchartheight"); 
//---------------------------------------------------------------------------------------
// plot matrix profile
aMp = MxGetBlock(mp,0,rows-1,0,0,True); // matrix profile value as array
miny = Lowest(aMp);
maxy = Highest(aMp);
pxScaleY = (pxchartbottom-pxcharttop-60)/(maxy-miny);
pxMP = pxchartbottom - 0.955*((aMp-miny) * pxScaleY + miny) - 30;

GfxMoveTo(fvb,pxMP[fvb]); // since current cursor is at last visible bar price)
for(i=fvb; i<=lvb; i++) {
	if(i>motif_idx AND i<=motif_idx+pWind) { //hightlight motif region
		GfxSelectPen(colorBrightGreen,2); 
		GfxLineTo(i, pxMP[i]);
	}
	if(i>nearest_motif AND i<=nearest_motif+pWind) { //hightlight nearest motif region
		GfxSelectPen(colorBrightGreen,2); 
		GfxLineTo(i, pxMP[i]);
	}
	if(i>discord_idx AND i<=discord_idx+pWind) { //hightlight discord region
		GfxSelectPen(colorRed,2); 
		GfxLineTo(i, pxMP[i]);
	}
	if(i>nearest_discord AND i<=nearest_discord+pWind) { //hightlight nearest discord region
		GfxSelectPen(colorRed,2); 
		GfxLineTo(i, pxMP[i]);
	}
	GfxSelectPen(colorWhite,1); 
    GfxLineTo(i, pxMP[i]);
}
GfxSelectPen(colorGreen, 2, 3 ); 
GfxSetTextAlign(6); GfxSetTextColor(colorGreen);
GfxTextOut("Motif (bi= "+motif_idx+")", motif_idx, pxchartbottom-30);
GfxTextOut("nearest_Motif (bi= "+nearest_motif+")", nearest_motif, pxchartbottom-30);

GfxSelectPen(colorRed, 2, 3 ); 
GfxSetTextAlign(6); GfxSetTextColor(colorRed);
GfxTextOut("Motif (bi= "+discord_idx+")", discord_idx, pxcharttop+30);
GfxTextOut("nearest_discord (bi= "+nearest_discord+")", nearest_discord, pxcharttop+30);
//---------------------------------------------------------------------------------------
// plot vertical dashed lines/rectangle for motif and nearest bars
GfxSelectPen(colorBrightGreen, 3, 2 );
GfxMoveTo(motif_idx, pxchartbottom-20);
GfxLineTo(motif_idx, pxcharttop+20);
GfxLineTo(motif_idx+pWind, pxcharttop+20);
GfxLineTo(motif_idx+pWind, pxchartbottom-20);
GfxLineTo(motif_idx, pxchartbottom-20);

GfxMoveTo(nearest_motif, pxchartbottom-20);
GfxLineTo(nearest_motif, pxcharttop+20);
GfxLineTo(nearest_motif+pWind, pxcharttop+20);
GfxLineTo(nearest_motif+pWind, pxchartbottom-20);
GfxLineTo(nearest_motif, pxchartbottom-20);

// plot vertical dashed lines for discord and nearest bars
GfxSelectPen(colorRed, 3, 2 );
GfxMoveTo(discord_idx, pxchartbottom-20);
GfxLineTo(discord_idx, pxcharttop+20);
GfxLineTo(discord_idx+pWind, pxcharttop+20);
GfxLineTo(discord_idx+pWind, pxchartbottom-20);
GfxLineTo(discord_idx, pxchartbottom-20);

GfxMoveTo(nearest_discord, pxchartbottom-20);
GfxLineTo(nearest_discord, pxcharttop+20);
GfxLineTo(nearest_discord+pWind, pxcharttop+20);
GfxLineTo(nearest_discord+pWind, pxchartbottom-20);
GfxLineTo(nearest_discord, pxchartbottom-20);


//////////////////////////////////////////////////////////////////////////////
// TITLE
_N(Title = StrFormat("<<"+pdir+pfn+".py"+">>  <<Context: "+ctxt+">>"+
					"\n{{NAME}} - {{INTERVAL}} {{DATE}},"+
					" Open %g, Hi %g, Lo %g, Close %g (%.1f%%)"+
					"\nstartDate: "+StartDate+"  endDate: "+EndDate+
					"\nszRows= %g,   szCols= %g",
					O, H, L, C, SelectedValue(ROC(C,1)), rows, cols ));
// thanks @beppe					 
_N(Title = Title+"\nfvb(bi):     "+fvb+ "      lvb(bi):   "+lvb+"    bars(bi):  "+tot);
_N(Title = Title+"\nlvb(status): "+lvx+"  BLANKS: " + blank);
_N(Title = Title+"  Barcount: " + BarCount);
_N(Title = Title + StrFormat("\n{{VALUES}}"));
type or paste code here

python code here

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 27 13:51:07 2021
@author: FE

"""
import sys
from numpy.random import f
from pyparsing import col

# proj folder
proj_path = "C:\\Proj\\Stumpy\\"
data_path = proj_path+"data\\"
images_path = proj_path+"images\\"
    
# data folder
if data_path not in sys.path:
    sys.path.append(data_path)

# images folder
if images_path not in sys.path:
    sys.path.append(images_path)


import stumpy
import numpy as np
import pandas as pd
from numba import cuda
from numba.cuda.simulator.api import list_devices

import matplotlib.pyplot as plt
import matplotlib.dates as dates
from matplotlib.patches import Rectangle
import datetime as dt

#plt.style.use('https://raw.githubusercontent.com/TDAmeritrade/stumpy/main/docs/stumpy.mplstyle')
plt.style={'figsize': (20, 6), 'xtick.direction' : 'out'}

import AmiPy

global df  #each on own line
global df1
global matrix_profile
df = pd.DataFrame()
df1 = pd.DataFrame()
matrix_profile = []

# plot signal and results
dspy = False
# print AmiPy to AFL messages
amimsgs = False
amiArgs = False
amiRtn = True

# plot style - Make the graphs a bit prettier, and bigger
#pd.set_option('display.mpl_style', 'default')
#plt.style={'figsize': (20, 6), 'xtick.direction' : 'out'}
plt.rcParams['figure.figsize'] = (15, 6)

def Sel_From_Quotes(sel):
    # select TimeSeries from list of columns which saved as quotes
    #TS_Name = df.columns #get the available column names as a list
    #colname = TS.Name.iloc(sel)  #select the desired column
    #todo - able to include multiple dimensions
    #return df.array[columns=colname]   #return the column from quotes to be the timeseries
    pass


def Quotes(Oarr,Harr,Larr,Carr,Varr):
    global df
    # quotes[OHLCV]
    df['Open'] = Oarr
    df['High'] = Harr
    df['Low'] = Larr
    df['Close'] = Carr
    df['Volume'] = Varr
    #AmiPy.Print( "Quotes to DataFrame success\n" )


# create Quotes [DateTime,O,H,L,C,V] dataframe, with dateTime as index
def Get_TS(DNarr, arrTS, tsLabel):
    global df
    df = pd.DataFrame()
    df.Name = tsLabel

    # Use AFL-DateNum: 10000 * (year - 1900) + 100 * month + day
    df['year'] = (DNarr/10000 + 1900).astype(int)
    df['month'] = (DNarr/100 % 100).astype(int)
    df['day'] = (DNarr % 100).astype(int)
    # covert to date from columns=year,month,day[hour,minute,second,millisec,microsecond]
    df['Date'] = pd.to_datetime(df)
    df.drop(columns=['year','month','day'], inplace=True)
    
    df['TS'] = arrTS.astype(np.float64)
    return df.shape[0]

# Prepare dataset from timeseries
def Prepare_TS(start_date,end_date, tsLabel):
    global mask
    global df1
    #df1 = pd.DataFrame()
    df1.Name = tsLabel

    mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    df1 = df.loc[mask]

    df1['Predict'] = 0.0;
    df1['Predict'] = 0.0;
    df1['Forecast'] = 0.0;
    df1['Forecast'] = 0.0;
    #if amiRtn:
    #    AmiPy.Print("\nSeries= "+np.str_(df1.head))
    #return df1.shape[0]


# save Timeseries (df.Name exists since Get_TS () has run)
def Save_TS_csv():
    global df
    df.to_csv(data_path+df.Name+'.csv',index=False) #saves without an index
    #return df.shape[0]



# Get Timeseries (needs tsLabel passed by AmiBroker)
def Get_TS_csv(tsLabel):
    global df  # required since 1st use hasn't bound df
    df = pd.read_csv(data_path+tsLabel+'.csv', index_col=0, infer_datetime_format=True)  #date is index
    df.Name = tsLabel
    #return df.shape[0]


# Evaluate - obtain the best fit or anamoly (least fit)
def Find_Motif(wind):
    global motif_idx, nearest_motif_distance
    
    # Best match - find the matching motif (smallest value after sort)
    motif_idx = np.argsort(matrix_profile[:, 0])[0]
    # and the nearest neighbor to this motif has a distance:
    nearest_motif_distance = matrix_profile[motif_idx, 1]

    #show_Motif(wind,20)
    #if amimsgs:
    #    AmiPy.Print(f"\nThe motif is located at index {motif_idx}")
    #    AmiPy.Print(f"\nThe nearest neighbor subsequence to this motif is {nearest_motif_distance} units away")

    Best = np.array([[motif_idx], [nearest_motif_distance]],dtype='int32')

    #if amiRtn:
    #    AmiPy.Print("\nBest= "+np.str_(Best))
    return Best


def Find_Discord(wind):
    global discord_idx, nearest_discord_distance
    # Least match or Anomoly
    discord_idx = np.argsort(matrix_profile[:, 0])[-1]
    # and the nearest neighbor to this discord has a distance that is quite far away:
    nearest_discord_distance = matrix_profile[discord_idx, 1]

    #Show_Anomoly(wind,20)
    #if amimsgs:
    #    AmiPy.Print(f"\nThe discord is located at index {discord_idx}")
    #    AmiPy.Print(f"\nThe nearest neighbor subsequence to this discord is {nearest_discord_distance} units away")

    Least = np.array([[discord_idx], [nearest_discord_distance]],dtype='int32')

    #if amiRtn:
    #    AmiPy.Print("\nLeast= "+np.str_(Least))
    return Least


# Visualise the Timeseries, nearest_neighbour & Motif
# rectangle width=wind, rectangle height=20
def show_Motif(m,w=20):
    # let’s put all of this together and plot the matrix profile next to our raw data:
    fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
    plt.suptitle('Motif (Pattern) Discovery', fontsize='30')
    axs[0].plot(df['TS'].values)
    axs[0].set_ylabel('Timeseries', fontsize='20')
    rect = Rectangle((motif_idx, 0), m, w, facecolor='lightgrey')
    axs[0].add_patch(rect)
    rect = Rectangle((nearest_motif_distance, 0), m, w, facecolor='lightgrey')
    axs[0].add_patch(rect)
    axs[1].set_xlabel('Time', fontsize ='20')
    axs[1].set_ylabel('Matrix Profile', fontsize='20')
    axs[1].axvline(x=motif_idx, linestyle="dashed")
    axs[1].axvline(x=nearest_motif_distance, linestyle="dashed")
    axs[1].plot(matrix_profile[:, 0])
    plt.show()


# Visualise the Timeseries & Anomoly (discord)
def Show_Anomoly(m,w=20):
    # The subsequence located at this global maximum is also referred to as a discord, novelty, or “potential anomaly”:
    fig, axs = plt.subplots(2, sharex=True, gridspec_kw={'hspace': 0})
    plt.suptitle('Discord (Anomaly/Novelty) Discovery', fontsize='30')

    axs[0].plot(df['TS'].values)
    axs[0].set_ylabel('Timeseries', fontsize='20')
    rect = Rectangle((discord_idx, 0), m, w, facecolor='lightgrey')
    axs[0].add_patch(rect)
    axs[1].set_xlabel('Time', fontsize ='20')
    axs[1].set_ylabel('Matrix Profile', fontsize='20')
    axs[1].axvline(x=discord_idx, linestyle="dashed")
    axs[1].axvline(x=nearest_discord_distance, linestyle="dashed")
    axs[1].plot(matrix_profile[:, 0])
    plt.show()


# Find Snippets, to generate a model of the timeseries
def Find_Snippets():
    pass


# Check for Timeseries chain
def Check_ATSC():
    pass


# Determine Matrix Profile
def Get_Profile(wind):
    global df, matrix_profile
    #window_size = 32  # which is approximately how many data points might be found in a pattern
    wind = np.int_(wind)
    matrix_profile = stumpy.stump(df['TS'].array, m=wind)

    if amiRtn:
        #AmiPy.Print("\n['Date'] Type is "+np.str_(df['Date'].dtype))
        AmiPy.Print("\n(nrow,col)= "+np.str_(df.shape))
        #AmiPy.Print("\nProfile= "+np.str_(matrix_profile))
    return matrix_profile


# Determine Matrix Profile - the model
def Find_Profile(tsLabel, wind):
    global df, matrix_profile
    df =pd.DataFrame()
    # get the csv timeseries, assumes Get_TS already done, so df exists
    Get_TS()
    #window_size = 50  # Approximately how many data points might be found in a pattern
    wind = np.int_(wind)
    matrix_profile = stumpy.stump(df['TS'].array, m=wind)

    if amiRtn:
        AmiPy.Print("\n['Date'] Type is "+np.str_(df['Date'].dtype))
        AmiPy.Print("\n(nrow,col)= "+np.str_(df.shape))
        AmiPy.Print("\nProfile= "+np.str_(matrix_profile))
    return matrix_profile


# Get Matrix Profile using TS(csv) dataset
def Get_Profile(tsLabel, wind):
    # get the csv timeseries
    df.Name = tsLabel
    Get_TS_csv()
 
    wind = np.int_(wind)
    matrix_profile = stumpy.stump(df['TS'].array, m=wind)

    #if amiRtn:
    #    AmiPy.Print("\n['Date'] Type is "+np.str_(df['Date'].dtype))
    #    AmiPy.Print("\n(nrow,col)= "+np.str_(df.shape))
    #    AmiPy.Print("\nProfile= "+np.str_(matrix_profile))
    return matrix_profile


5 Likes

Reference links:
https://stumpy.readthedocs.io/en/latest/Tutorial_The_Matrix_Profile.html

FE

3 Likes