# Notebook setup: stdlib, pandas display options, SQL drivers, plotting, profiling.
import os
from datetime import datetime, timedelta, timezone
import pandas as pd
# Widen pandas display limits so long log messages are fully readable inline.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 400)
import pymssql
import sqlalchemy
import pandas_profiling as pp
# import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# IPython magic: render matplotlib figures inline (only valid inside Jupyter).
%matplotlib inline
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
# Primary connection used by every query below; NOTEBOOK_DB holds a full
# SQLAlchemy URL (e.g. mssql+pymssql://...).
engine = create_engine(os.environ['NOTEBOOK_DB'])
For Azure SQL connections, the full username is `<user>@<server>`.
The URL pattern is:
<protocol>://<user>:<pass>@<server>/<database>
Because the username itself contains an '@', it must be percent-encoded as %40 inside the URL, e.g.:
mssql+pymssql://username%40servername:password@servername.database.windows.net/database_name?charset=utf8
import urllib.parse  # was missing in the original cell; quote_plus below raised NameError

# Build a pyodbc-style connection string from environment variables.
# NOTE(review): conn_str is constructed but the notebook actually queries via
# `engine` (NOTEBOOK_DB) above — presumably kept as a reference/alternative.
server = os.environ['SERVER']
database = os.environ['DATABASE']
username = os.environ['USERNAME']
password = os.environ['PASSWORD']
driver = '{ODBC Driver 17 for SQL Server}'
cstr = f'DRIVER={driver};SERVER={server};PORT=1433;DATABASE={database};UID={username};PWD={password};Connection Timeout=30'
# quote_plus escapes ';', '=', etc. so the raw ODBC string can travel inside a
# URL query parameter.
params = urllib.parse.quote_plus(cstr)
conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)
def get_log_data():
    """Return up to the newest 1,000,000 rows of dbo.Logging as a DataFrame.

    [Date] and [DeviceDate] are additionally converted to varchar (style 121,
    ODBC canonical) server-side: FreeTDS/pymssql can mangle datetime values,
    so we fetch them as strings and parse in Python (see clean_log_data).
    """
    # Plain string literal: nothing is interpolated (the original used a
    # needless f-string). SQL text is unchanged.
    sql = '''SELECT TOP 1000000
[Id],convert(varchar(50), [Date], 121) as LogDate,
[Date] as TestDate,
[Thread]
,[Level],[Logger],[Event]
,[Context],[Message],convert(varchar(50), [DeviceDate], 121) as DeviceDate
,[Exception],[EndpointId],[CsrId]
,[FacilityId],[CallId]
FROM [dbo].[Logging]
ORDER BY Id DESC'''
    return pd.read_sql(sql, engine)
# Quick exploration of the raw log pull: preview, dtypes, summary statistics,
# and a full pandas-profiling report (can be slow on 1M rows).
tmp = get_log_data()
tmp.head()
tmp.dtypes
tmp.describe()
pp.ProfileReport(tmp)
TL;DR — FreeTDS/pymssql can mangle SQL Server datetimes, so query dates as varchar strings and parse them back into datetimes in Python:
https://pymssql.readthedocs.io/en/stable/freetds_and_dates.html
def clean_log_data(df):
    """Normalize the raw Logging pull for analysis.

    Parses the string date columns, drops blank-message rows, and adds
    date / dayOfWeek / hour convenience columns. Returns a new DataFrame;
    the caller's input is not modified.
    """
    # NOTE(review): fillna(0) runs before date parsing, so a NULL LogDate
    # becomes the epoch (1970-01-01) after to_datetime — confirm intended.
    df = df.fillna(0)
    df['LogDate'] = pd.to_datetime(df['LogDate'])
    df['DeviceDate'] = pd.to_datetime(df['DeviceDate'])
    # Blank messages in this table are stored as a single space.
    df = df[df['Message'] != ' ']
    # Day of week as an integer, Monday=0 .. Sunday=6. Vectorized .dt
    # accessors replace the original per-row .apply (same values, faster).
    df['dayOfWeek'] = df['LogDate'].dt.weekday
    df['date'] = df['LogDate'].dt.strftime('%Y-%m-%d')
    df['hour'] = df['LogDate'].dt.hour
    # Drop stray "Unnamed: N" index columns from prior CSV round trips.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    return df
def get_endpoints():
    """Return every row of dbo.Endpoints (deleted ones included) as a DataFrame."""
    # Plain string literal: nothing is interpolated (the original used a
    # needless f-string). SQL text is unchanged.
    sql = '''
SELECT
[EndpointId],[FacilityId],[EndpointName],[EndpointType],[Login],[Password]
,[HasMap],[HasPhoto],[InvertCamera],[AudioBandwidth],[VideoBandwidth]
,[SupervisorPhone],[AudioInputGain],[AudioOutputGain],[Hardware]
,[InvertDisplay],[Manufacturer],[Model],[InboundPhone]
,[GateVendDigits],[Version],[ExternalCameraURL],[IdlePageText]
,[UseStaticImage],[BrainSerialNumber],[DisplaySerialNumber],[InServiceDate]
,[ExternalMonitoring],[Notes],[AllowGateLock],[Deleted]
,[InboundPhoneType]
FROM [dbo].[Endpoints]
'''
    return pd.read_sql(sql, engine)
def get_calls_from_hour_ago(hourago):
    """Return calls whose Start falls strictly inside (hourago, hourago + 1h).

    `hourago` may be naive or tz-aware. The parsed Start column is tz-naive,
    so an aware value is stripped to naive before comparing (the original
    compared aware vs naive, which raises TypeError in pandas). DB times are
    assumed to be UTC wall-clock — TODO confirm.
    """
    # Dates fetched as varchar (style 121) to dodge FreeTDS datetime issues.
    sql = """
select
convert(varchar, start, 121) as Start,
convert(varchar, [end], 121) as [endDate],
CallId, EndpointId, FacilityName, EndpointName, SecondsToAnswer
from Calls
where Start >= '{0}'
order by CallId desc;""".format(hourago.strftime("%Y-%m-%d %H:%M:%S"))
    calls = pd.read_sql(sql, engine)
    calls['Start'] = pd.to_datetime(calls['Start'])
    calls['endDate'] = pd.to_datetime(calls['endDate'])
    # Normalize to naive so the comparisons below cannot mix aware and naive.
    window_start = hourago.replace(tzinfo=None) if hourago.tzinfo is not None else hourago
    window_end = window_start + timedelta(hours=1)
    # Single boolean mask instead of the original chained .loc[...].loc[...].
    return calls.loc[(calls['Start'] > window_start) & (calls['Start'] < window_end)]
# Inspect one historical hour for missed calls. SecondsToAnswer == 9999 is the
# sentinel for "call was never answered" (inferred from usage — confirm).
hour_ago = datetime(2019, 6, 3, 3, 0, 0)
hour_ago = hour_ago.replace(tzinfo=timezone.utc)
calls = get_calls_from_hour_ago(hour_ago)
callIds = calls.loc[calls['SecondsToAnswer'] == 9999]['CallId'].unique()
for callId in callIds:
    c = calls.loc[calls['CallId'] == callId]
    print(f'Missed Facility: {c["FacilityName"].values} -> Endpoint: {c["EndpointName"].values}')
missed_calls = calls.loc[calls['SecondsToAnswer'] == 9999]
# Original bug: `missed_calls.loc[i:, "FacilityName"]` label-slices the whole
# remaining frame instead of selecting row i. Iterate rows instead.
for _, row in missed_calls.iterrows():
    print(f'Missed Facility: {row["FacilityName"]}')
# Exact-match log messages that are counted per bucket when computing alert
# thresholds (see alert_threshold). List order only affects reporting order.
full_message_types = [
'SIGNALR Error: Error: WebSocket closed.',
'START Endpoint Page Loaded','SIGNALR signalR start',
'ANSWER ANSWER message received',
'CALL Call button push detected','AJAX POST EndpointLogs failed',
'ENDCALL ENDCALL message received','getUserMedia local audio/video stream aquired',
'SIGNALR Disconnected','SignalR SignalR Connection Drop',
'Initializing Media Streams Start','RINGER Begin ringing',
'StartWebRTC starting','StartWebRTC connection initialized',
'version: 52','EVENT Call button pressed','SIGNALR Connection slow',
'HOLDCALL Placing on hold', 'NODE readbutton cmd failed', 'Exception in SendWebRTCMessage',
'OPENGATE OPENGATE command received', 'SIGNALR Error: Error: Failed to ping server.',
'AJAX SUCCESS get endpoint success: 05-141,v1.5.6_16.04.1 LTS,v1.5.5,\n\nOK\n'
]
# Prefix-match message families. NOTE(review): not referenced anywhere in this
# notebook — presumably intended for a startswith()-based variant of the
# threshold logic; verify before deleting.
starts_with_types = [
'Initializing endpoint status complete'
,'Endpoint login OK, Login'
,'AJAX ERROR Open Gate Error: OPENGAT'
,'AJAX SUCCESS get endpoint success'
,'GetGateVendLogs took'
,'GetGateVendLogsExport took:'
,'SIGNALR Error: Error: The client has been inactive since'
,'User login OK. Login'
,'Error'
,'Exception'
,'User login failed. Login'
,'Diagnostic Version info'
]
def alert_threshold(df, groupby=None):
    """Derive per-message alert thresholds from historical log counts.

    For each message in full_message_types, counts occurrences per `groupby`
    bucket (default ['date', 'EndpointId']) and sets the threshold at
    mean + 2 standard deviations of the bucket counts.

    Returns a list of tuples: [(message, threshold, mean), ...].
    """
    # Avoid the original mutable default argument (groupby=['date', 'EndpointId']).
    if groupby is None:
        groupby = ['date', 'EndpointId']
    thresholds = []
    for message in full_message_types:
        grouped = df.loc[df['Message'] == message].groupby(groupby)
        results = pd.DataFrame()
        # Count of non-null Id per bucket, largest buckets first.
        results['counts'] = grouped.count().sort_values('Id', ascending=False)['Id']
        if results.empty:
            continue
        std_dev = results.std()['counts']
        avg = results.mean()['counts']
        # A single bucket yields NaN std; skip rather than crash on int(NaN)
        # (the original raised ValueError in that case).
        if pd.isna(std_dev):
            continue
        thresholds.append((message, int(avg + 2 * std_dev), avg))
    return thresholds
def get_failing_endpoints(log_df, day, thresholds, endpoints_df):
    """Find endpoints whose last-hour message counts exceed their thresholds.

    Parameters
    ----------
    log_df : cleaned log DataFrame (needs EndpointId, date, Message, LogDate).
    day : 'YYYY-MM-DD' string restricting which logs are examined.
    thresholds : [(message, threshold, mean), ...] from alert_threshold.
    endpoints_df : endpoint metadata with EndpointId and EndpointName.

    Returns
    -------
    dict keyed by EndpointId: {'name', 'messages', 'total_delta', 'call_count'}
    for each endpoint whose last-hour volume exceeds a threshold while its
    call volume alone cannot explain the spike.
    """
    endpoints = log_df['EndpointId'].unique()
    today_df = log_df.loc[log_df['date'] == day]
    hourago = datetime.utcnow() + timedelta(hours=-1)
    # Aware copy for the calls query (matches the original call), naive copy
    # for comparing against the tz-naive LogDate column — the original
    # compared aware vs naive, which raises TypeError in pandas.
    hourago_utc = hourago.replace(tzinfo=timezone.utc)
    calls = get_calls_from_hour_ago(hourago_utc)
    failing_endpoints = {}
    for e in endpoints:
        endpdf = today_df.loc[today_df['EndpointId'] == e]
        call_count = calls.loc[calls['EndpointId'] == e].count().values[0]
        for message, threshold, _avg in thresholds:
            msgdf = endpdf.loc[endpdf['Message'] == message]
            if msgdf.empty:
                continue
            # Count of this message before the last hour vs. in total;
            # the difference is the last-hour volume.
            hour_ago_count = msgdf.loc[msgdf['LogDate'] < hourago].count().values[0]
            now_count = msgdf.count().values[0]
            last_hour_count = now_count - hour_ago_count
            # Flag only when volume spiked AND calls alone can't explain it.
            if last_hour_count > threshold and call_count < threshold:
                endpoint_name = endpoints_df.loc[endpoints_df['EndpointId'] == e]['EndpointName'].values[0]
                if e not in failing_endpoints:
                    failing_endpoints[e] = {'name': endpoint_name, 'messages': [], 'total_delta': 0}
                failing_endpoints[e]['messages'].append({
                    'message': message,
                    'expected': threshold,
                    'actual': last_hour_count,
                    'delta': last_hour_count - threshold
                })
                failing_endpoints[e]['total_delta'] += last_hour_count - threshold
                failing_endpoints[e]['call_count'] = call_count
    return failing_endpoints
# Full pipeline: pull + clean logs, load endpoint metadata, compute thresholds,
# then find endpoints failing right now.
log_df = get_log_data()
log_df = clean_log_data(log_df)
endpoints_df = get_endpoints()
# List of tuples: [('Message', ExpectedCount, Average), ...]. The original cell
# computed this twice with identical arguments — once is enough.
thresholds = alert_threshold(log_df, ['date', 'EndpointId', 'hour'])
endpoints = log_df['EndpointId'].unique()
today = datetime.now().strftime('%Y-%m-%d')
# NOTE(review): today_df is not used below (get_failing_endpoints filters by
# `day` internally); kept for interactive inspection.
today_df = log_df.loc[log_df['date'] == today]
failing_endpoints = get_failing_endpoints(log_df, today, thresholds, endpoints_df)
# Report failing endpoints in descending order of total threshold overshoot.
sorted_endpoints = sorted(
    failing_endpoints.keys(),
    key=lambda e: failing_endpoints[e]['total_delta'],
    reverse=True)
for e in sorted_endpoints:
    info = failing_endpoints[e]
    if info['total_delta'] < 25:
        # Original bugs fixed here: print() was given %s placeholders plus
        # positional args (it printed a tuple, never formatted), and the text
        # said "less than 30" while the test is against 25.
        print(f'Endpoint ID: {e} - Total delta less than 25, move on, '
              f'delta: {info["total_delta"]}, calls: {info["call_count"]}')
        continue
    msg = f'\nId: {e} - {info["name"]} total delta: {info["total_delta"]:.0f} Calls: {info["call_count"]}'
    for m in info['messages']:
        # "Mesage" typo in the original output fixed.
        msg += f'\n\tMessage: {m["message"]} - expected: {m["expected"]} - actual: {m["actual"]}'
    print(msg)
# Scratch cell: Python list-slicing reference, unrelated to the log analysis.
a = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
start, stop = 2, 3
a[start:stop] # items start through stop-1
a[start:] # items start through the rest of the array
a[:stop] # items from the beginning through stop-1
a[:] # a copy of the whole array
a[-1] # last item in the array
a[-2:] # last two items in the array
a[:-2] # everything except the last two items