请教去除一个几万行的文本文件数据重复的算法 Delphi / Windows SDK/APIhttp://www.delphi2007.net/DelphiBase/html/delphi_20061211021429203.html
有一个几万行数据的文本文件,一行为一个数据,请问用什么算法去掉重复的数据才最有效率呢,自己弄的好慢,谢谢各位帮助。
顶上去,继续请教
只会笨办法。
1:放在数据库里,让数据库帮你去掉重复
2:先排序,然后在一次循环找出重复的。至于排序的方式则有很多的选择。
对,先排序,后排除
排序就是例如放到一个 TSTRINGLIST里吗?
select distanct
排序然后排出比较简单,算法复杂度O(N*Log(N))
(不知道重复率有多大?)
利用Hash值来进行快速排出,算法复杂度接近O(N):
1.建立一个TNode数组(大小与不重复的数据量相当,或大些也可以) TNode=record Str:Pchar PNext : ^TNode end; (单向链表)
2.计算每个数据S的Hash值,映射到数组元素I, 如果I.PChar=nil,则I.PChar=s; 否则对比这个单向链表的所有字符串值,看S是否已经有了,没有的话添加到I的最后面;
同意楼上
Hash+排序
procedure TForm1.Button1Click(Sender: TObject);
var
AInput : TStringList;
AOutput : TStringList;
iLoop : Integer;
sTemp : String;
begin
AInput := TStringList.Create;
try
AInput.LoadFromFile('C:\Input.TXT');
AOutput := TStringList.Create;
try
for iLoop := 0 to AInput.Count - 1 do
begin
sTemp := AInput.Strings[iLoop];
if AOutput.IndexOf(sTemp) < 0 then AOutput.Add(sTemp);
end;
AOutput.SaveToFile('C:\Outpt.TXT');
finally
AOutput.Free;
end;
finally
AInput.Free;
end;
end;
jadeluo(秀峰) 的算法比较容易直接实现
Hash 算法最快但比较麻烦
推荐jadeluo(秀峰)
var
myl,mym:TStringList;
mys:string;
myi:integer;
begin
myl:=TStringList.create;
mym:=TStringList.create;
myl.LoadFromFile('input.txt');
myl.Sort;
mys:='';
for myi:=0 to myl.Count-1 do
if mys<>myl.Strings[myi] then
begin
mym.Add(mys);
mys:=myl.Strings[myi];
end;
mym.SaveToFile('out.txt');
myl.Free;
mym.Free;
end;
这个方法效率如何?
别用tstring做,用sql做最快,先建一个表,然后用bcp导入到这个表里,bcp的语法里有去除重复的功能,然后在用bcp的命令给引出来。
tstring处理行数较小的还成,几万行以上,只sort一下就慢死了。
另一个方法:
用存盘的方法(建个临时目录),把一行数据当成一个文件名,放硬盘上存,如果重名了就不存,这样只从头到尾走一次就够了,执行完后,在用dir命令存成一个文件就成了!
把油箱告诉我,我给你解决方法,不用数据库,性能特特别快。
808886@gmail.com
谢谢各位拉
我做了一个算法,10万行查重复,用15M。我的机器是双核3.5G
我做了一个算法,10万行查重复,用15秒。我的机器是双核3.5G,不知道是否达到要求
加上一行Sort,可以提高不少速度。
关于TStringList类的Sort和IndexOf函数,Delphi中提供了源代码的。其中,Sort使用的是快速排序,IndexOf在有Sort的情况下为二分法查找,否则为顺序查找。
效率应该不低的。
procedure TForm1.Button1Click(Sender: TObject);
var
AInput : TStringList;
AOutput : TStringList;
iLoop : Integer;
sTemp : String;
begin
AInput := TStringList.Create;
try
AInput.LoadFromFile('C:\Input.TXT');
AInput.Sort;
AOutput := TStringList.Create;
try
for iLoop := 0 to AInput.Count - 1 do
begin
sTemp := AInput.Strings[iLoop];
if AOutput.IndexOf(sTemp) < 0 then AOutput.Add(sTemp);
end;
AOutput.SaveToFile('C:\Outpt.TXT');
finally
AOutput.Free;
end;
finally
AInput.Free;
end;
end;
我的算法是拿内存换时间。
unit uCheckDup;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls,
Dialogs, StdCtrls;
procedure StartCheckDup;
function CheckDup(AStr: string): boolean;
implementation
var
StrListArray: array of TStringList;
const
BufSize = 65536;// 64K
procedure StartCheckDup;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I].Clear;
end;
function CheckDup(AStr: string): boolean;
type
TWordArray = array of word;
var
Key: word;
I, L: integer;
AStrList: TStringList;
begin
Key := 0;
L := length(AStr);
if L = 1 then
Key := Ord(AStr[1])
else
for I := (L shr 1) - 1 downto 0 do
Key := Key + TWordArray(PChar(AStr))[I];
if (L and 1) <> 0 then
Key := Key + Ord(AStr[L]);
AStrList := StrListArray[Key];
if (AStrList.Count = 0) or (AStrList.IndexOf(AStr) < 0) then
begin
AStrList.Append(AStr);
Result := False;
end
else
Result := True;
end;
procedure GenerateArray;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I] := TStringList.Create;
end;
procedure FreeArray;
var
I: integer;
begin
for I := 0 to BufSize - 1 do
FreeAndNil(StrListArray[I]);
end;
initialization
GenerateArray;
finalization
FreeArray;
end.
使用方法:procedure TForm1.Button1Click(Sender: TObject);
var
ATick: DWord;
I: integer;
begin
ATick := GetTickCount;
StartCheckDup;
sl2.Clear;
for I := 0 to sl.Count-1 do
begin
if not CheckDup(sl[I]) then
sl2.Append(sl[I]);
Caption := IntToStr(I);
end;
ShowMessage('Time:' + IntToStr(GetTickCount - ATick)
+ 'ms,Remains:' + IntToStr(sl2.Count));
end;
up
实用算法大讨论,好贴.
//十万行,双核1.6G耗时1.2 秒
var
I: Integer;
vTickCount: Longword;
begin
Randomize; // test
with TStringList.Create do try
//LoadFromFile('input.txt'); //载入文件
for I := 1 to 100000 do Add(IntToStr(Random(MaxInt))); // 产生十万行文本
vTickCount := GetTickCount;
Sort; //排序
for I := Count - 1 downto 0 do
if (I >= 1) and (Strings[I] = Strings[I - 1]) then
Delete(I);
Caption := IntToStr(GetTickCount - vTickCount); // 输出用时
finally
Free;
end;
end;
应该这样,规定字符串的长度为某个固定值比较好。我推荐20。测试数据:
var
I: integer;
begin
Randomize;
sl.BeginUpdate;
sl.Clear;
for I := 0 to 100000 do
begin
sl.Append('test' + format('%.11d', [random(50000)]));
Caption := IntToStr(I);
end;
sl.Sort;
sl.EndUpdate;
我发现了,Caption := IntToStr(I);是个速度杀手,我把它关闭之后,我的算法处理10万条数据速度达到了2秒!好像比zswang(伴水清清)(专家门诊清洁工) 的慢。但是我怀疑zswang(伴水清清)(专家门诊清洁工) 采用的数据应该是具有大致1半重复数据(也就是10万次调用random(50000)),看看性能如何?
破最新纪录:我修改了一下算法,使zswang的for I := 1 to 100000 do Add(IntToStr(Random(MaxInt)))产生的10万条数据在我这里仅用了0.3秒查完重复。针对我自己产生的高重复数据,10万条的处理我用了2.3秒!
搂主感谢我吧。注意:本次算法函数返回值调整了一下,无重复返回true,和以前相反。
unit uCheckDup;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls,
Dialogs, StdCtrls;
procedure StartCheckDup;
function CheckDup(AStr: string): boolean;
implementation
const
BufSize = 65536;// 64K
var
StrListArray: array of TStringList;
Crc16Tab: array[0..$FF] of word =
($00000, $01021, $02042, $03063, $04084, $050a5, $060c6, $070e7,
$08108, $09129, $0a14a, $0b16b, $0c18c, $0d1ad, $0e1ce, $0f1ef,
$01231, $00210, $03273, $02252, $052b5, $04294, $072f7, $062d6,
$09339, $08318, $0b37b, $0a35a, $0d3bd, $0c39c, $0f3ff, $0e3de,
$02462, $03443, $00420, $01401, $064e6, $074c7, $044a4, $05485,
$0a56a, $0b54b, $08528, $09509, $0e5ee, $0f5cf, $0c5ac, $0d58d,
$03653, $02672, $01611, $00630, $076d7, $066f6, $05695, $046b4,
$0b75b, $0a77a, $09719, $08738, $0f7df, $0e7fe, $0d79d, $0c7bc,
$048c4, $058e5, $06886, $078a7, $00840, $01861, $02802, $03823,
$0c9cc, $0d9ed, $0e98e, $0f9af, $08948, $09969, $0a90a, $0b92b,
$05af5, $04ad4, $07ab7, $06a96, $01a71, $00a50, $03a33, $02a12,
$0dbfd, $0cbdc, $0fbbf, $0eb9e, $09b79, $08b58, $0bb3b, $0ab1a,
$06ca6, $07c87, $04ce4, $05cc5, $02c22, $03c03, $00c60, $01c41,
$0edae, $0fd8f, $0cdec, $0ddcd, $0ad2a, $0bd0b, $08d68, $09d49,
$07e97, $06eb6, $05ed5, $04ef4, $03e13, $02e32, $01e51, $00e70,
$0ff9f, $0efbe, $0dfdd, $0cffc, $0bf1b, $0af3a, $09f59, $08f78,
$09188, $081a9, $0b1ca, $0a1eb, $0d10c, $0c12d, $0f14e, $0e16f,
$01080, $000a1, $030c2, $020e3, $05004, $04025, $07046, $06067,
$083b9, $09398, $0a3fb, $0b3da, $0c33d, $0d31c, $0e37f, $0f35e,
$002b1, $01290, $022f3, $032d2, $04235, $05214, $06277, $07256,
$0b5ea, $0a5cb, $095a8, $08589, $0f56e, $0e54f, $0d52c, $0c50d,
$034e2, $024c3, $014a0, $00481, $07466, $06447, $05424, $04405,
$0a7db, $0b7fa, $08799, $097b8, $0e75f, $0f77e, $0c71d, $0d73c,
$026d3, $036f2, $00691, $016b0, $06657, $07676, $04615, $05634,
$0d94c, $0c96d, $0f90e, $0e92f, $099c8, $089e9, $0b98a, $0a9ab,
$05844, $04865, $07806, $06827, $018c0, $008e1, $03882, $028a3,
$0cb7d, $0db5c, $0eb3f, $0fb1e, $08bf9, $09bd8, $0abbb, $0bb9a,
$04a75, $05a54, $06a37, $07a16, $00af1, $01ad0, $02ab3, $03a92,
$0fd2e, $0ed0f, $0dd6c, $0cd4d, $0bdaa, $0ad8b, $09de8, $08dc9,
$07c26, $06c07, $05c64, $04c45, $03ca2, $02c83, $01ce0, $00cc1,
$0ef1f, $0ff3e, $0cf5d, $0df7c, $0af9b, $0bfba, $08fd9, $09ff8,
$06e17, $07e36, $04e55, $05e74, $02e93, $03eb2, $00ed1, $01ef0);
function CRCValue(AStr: string): Word;
var
i: integer;
begin
Result := 0;
for i := Length(AStr) downto 1 do
Result := Hi(Result) xor CRC16Tab[byte(AStr[i]) xor Lo(Result)];
end;
procedure StartCheckDup;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I].Clear;
end;
function CheckDup(AStr: string): boolean;
begin
with StrListArray[CRCValue(AStr)] do
begin
Result := (Count = 0) or (IndexOf(AStr) < 0);
if Result then
Append(AStr);
end;
end;
procedure GenerateArray;
var
I: integer;
begin
SetLength(StrListArray, BufSize);
for I := 0 to BufSize - 1 do
StrListArray[I] := TStringList.Create;
end;
procedure FreeArray;
var
I: integer;
begin
for I := 0 to BufSize - 1 do
FreeAndNil(StrListArray[I]);
end;
initialization
GenerateArray;
finalization
FreeArray;
end.
谢谢各位的热情帮助,特别是 yangfl(yangfl) jadeluo(秀峰) zswang(伴水清清)