record count histogram distribution

Last Post 07 Nov 2008 03:21 AM by SwePeso. 4 Replies.
AddThis - Bookmarking and Sharing Button Printer Friendly
  •  
  •  
  •  
  •  
  •  
Sort:
PrevPrev NextNext
You are not authorized to post a reply.
Author Messages
johnltice
New Member
New Member

--
06 Nov 2008 12:10 PM
Hi.

I am curious to know if there is a set-based solution to a problem I need to solve.

I have a table called LogEvent(LogDate datetime, LogData nvarchar(40)). This table contains a little over 3 million rows with unknown/unpredictable gaps in LogDate (which is the clustered index). I need to determine the lower and upper bounds of LogDate for each x number of records. For instance if x = 50,000, then for every 50,000 records I would like something like a histogram (200,225 row example given):

RecordCount LowLogDate HighLogDate
----------- ---------- -----------
0-50000 2000-01-01 2001-06-13
50001-100000 2001-06-13 2001-08-24
100001-150000 2001-08-24 2001-10-13
150001-200000 2001-10-13 2001-10-31
200001-200225 2001-10-31 2001-10-31

using ntile, the following mostly works:
declare @stepcount int, @chunksize int
set @chunksize = 50000
select @stepcount = count(LogDate) / @chunksize + case when count(LogDate) % @chunksize <> 0 then 1 else 0 end from LogEvent;


with cteLogHistogram as (
select
perxxtile = ntile(@stepcount) over(order by LogDate),
date = convert(char(10), LogDate, 120)
from LogEvent
)
select perxxtile, lb = min(date), ub = max(date), count(*) from cteLogHistogram group by perxxtile order by perxxtile;

This isn't exactly what I want since ntile(@stepcount) evenly divides all records. I want 50,000 records for steps 1 through (n-1) and then step n = 50,000 % n

What I've tried (largely based on Itzik Ben-Gan's examples in 'Inside Microsoft SQL Server 2005: T-SQL Querying') is the following:

drop table dbo.Nums
create table dbo.Nums(n bigint not null primary key)
declare @max int, @rc int
set @max = 3000000
set @rc = 1
insert into nums values(1)
while @rc * 2 <= @max
begin
insert into nums select n + @rc from nums
set @rc = @rc * 2
end
insert into nums select n + @rc from nums where n + @rc <= @max
go


ALTER function dbo.fn_histsteps(@numberofsteps int)
returns table as
return
select
step = n * stepsize,
lb = dateadd(dd, (n-1) * stepsize, mn),
ub = dateadd(dd, n * stepsize, mn)
from dbo.nums
cross join (
select
mn = min(a.LogDate),
stepsize = datepart(dd, dateadd(mm, 1, datediff(dd,min(a.LogDate), max(a.LogDate)))) / @numberofsteps
from dbo.LogEvent a
) d where n<=@numberofsteps
go


declare @stepcount int select @stepcount = count(LogDate) / 50000 from LogEvent
select step, [count] = count(a.LogDate)
from dbo.fn_histsteps(@stepcount) s join LogEvent a on a.LogDate >= lb and a.LogDate < ub
group by step

(0 row(s) affected)

The problem I'm having is that my stepsize in fn_histsteps is always 0, but I can't see a non-cursor solution for calculating the next LogDate where the count of records = 50000, since the currently evaluated LogDate. Any insight on how I can change this would be helpful.

Thanks,

John
SwePeso
New Member
New Member

--
07 Nov 2008 03:21 AM
Why overcomplicate things?
I don't know which parts are from Itzik but I believe this is easier to maintain and understand.

-- Prepare sample data
CREATE TABLE #logEvent
(
logDate DATETIME,
logData NVARCHAR(40)
)

INSERT #logEvent
(
logDate,
logData
)
SELECT ABS(CHECKSUM(NEWID())) % 524288,
NEWID()
FROM master..spt_values AS v
INNER JOIN master..spt_values AS w ON w.Type = 'P'
AND w.Number < 512
WHERE v.Type = 'P'
AND v.Number < 512

CREATE CLUSTERED INDEX IX_logDate ON #logEvent (logDate)

-- Prepare user supplied chunk parameter value
DECLARE @Chunk INT

SET @Chunk = 50000

-- Display the data
SELECT MIN(recID) AS lowRange,
MAX(recID) AS highRange,
MIN(logDate) AS lowLogDate,
MAX(logDate) AS highLogDate
FROM (
SELECT logDate,
ROW_NUMBER() OVER (ORDER BY logDate) - 1 AS recID
FROM #logEvent
) AS d
GROUP BY recID / @Chunk
ORDER BY recID / @Chunk

-- Clean up
DROP TABLE #logEvent
SwePeso
New Member
New Member

--
07 Nov 2008 03:29 AM
Create your function like this instead.

CREATE FUNCTION dbo.fn_HistSteps
(
@numberOfSteps INT
)
RETURNS TABLE
AS
RETURN SELECT recID / @numberOfSteps AS stepNum,
MIN(recID) AS lowRange,
MAX(recID) AS highRange,
MIN(logDate) AS lowDate,
MAX(logDate) AS highDate
FROM (
SELECT logDate,
ROW_NUMBER() OVER (ORDER BY logDate) - 1 AS recID
FROM logEvent
) AS d
GROUP BY recID / @numberOfSteps
johnltice
New Member
New Member

--
07 Nov 2008 09:43 AM
THANKS!!!
SwePeso
New Member
New Member

--
07 Nov 2008 03:37 PM
You're welcome.
It is quite easy when you know your math.
You are not authorized to post a reply.

Acceptable Use Policy