有没有更有效的方法来比较两个列表的项,并找出 leftOuterItems、rightOuterItems 和 matchedItems?
我们经常需要一种方法来比较两个列表的项,找出哪些项只存在于ListA(leftOuterItems)中,哪些只存在于ListB(rightOuterItems)中以及它们的共同项(matchedItems)...
我最终得到了两个解决方案,如下所示:
一种方式是先对两个列表排序,然后逐项迭代比较(当集合的项很多时,排序会带来性能损失);另一种方式是使用字典和哈希(当集合只有少量项时,由于内存分配等开销,它比第一种方式慢)。
*还要记住,我想比较两个对象列表,例如两个 Person 类列表(不仅仅是基元)。这就是我创建通用扩展方法的原因
那么,您有什么更好的建议吗?
先谢谢你!
/// <summary>
/// Console driver that times OrderedCompare against HashedCompare on a small
/// and a large pair of integer arrays.
/// </summary>
class Program
{
    /// <summary>
    /// Runs both comparisons on a few-items data set and on a many-items data set,
    /// then waits for a key press so the console window stays open.
    /// </summary>
    static void Main(string[] args)
    {
        var fewItemsList1 = new[] { 1,4,2,3,7,6,9,5 };
        var fewItemsList2 = new[] { 15,5,14,13,12,8,11,10 };
        Run(100_000,fewItemsList1,fewItemsList2);

        var manyItemsList1 = Enumerable.Range(0,100_000).ToArray();
        var manyItemsList2 = Enumerable.Range(50000,150_000).ToArray();
        Run(1000,manyItemsList1,manyItemsList2);

        Console.WriteLine("Hello World!");
        Console.Read();
    }

    /// <summary>
    /// Executes each comparison <paramref name="count"/> times and prints the elapsed time.
    /// </summary>
    /// <param name="count">Number of iterations per comparison method.</param>
    /// <param name="l1">Left-hand list.</param>
    /// <param name="l2">Right-hand list.</param>
    private static void Run(int count,int[] l1,int[] l2)
    {
        var sw = Stopwatch.StartNew();
        for (int i = 0; i < count; i++)
        {
            // Both lists hold plain ints, so the identity function is the key selector for each side.
            l1.OrderedCompare(l2,x => x,x => x,out int[] leftOuterItems,out int[] rightOuterItems,out (int,int)[] matchedItems);
        }
        sw.Stop();
        Console.WriteLine($"OrderedCompare for {count} iterations with L1 items:{l1.Length} and L2 items:{l2.Length} took {sw.Elapsed}");

        sw.Restart();
        for (int i = 0; i < count; i++)
        {
            l1.HashedCompare(l2,x => x,x => x,out int[] leftOuterItems2,out int[] rightOuterItems2,out (int,int)[] matchedItems2);
        }
        // Stop before reporting so the console write is not part of the measurement.
        sw.Stop();
        Console.WriteLine($"HashedCompare for {count} with L1 items:{l1.Length} and L2 items:{l2.Length} iterations took {sw.Elapsed}");
    }
}
/// <summary>
/// Extension methods that split two sequences into items found only in the left
/// sequence, items found only in the right sequence, and matched pairs, where
/// items are matched by a caller-supplied key.
/// </summary>
public static class Extensions
{
    /// <summary>
    /// Compares two sequences by sorting both on their keys and merging them in a single pass.
    /// </summary>
    /// <param name="source">Left-hand sequence.</param>
    /// <param name="target">Right-hand sequence.</param>
    /// <param name="sourceKeyGetter">Extracts the comparison key from a left-hand item.</param>
    /// <param name="targetKeyGetter">Extracts the comparison key from a right-hand item.</param>
    /// <param name="leftOuterItems">Items whose key occurs only in <paramref name="source"/>, in ascending key order.</param>
    /// <param name="rightOuterItems">Items whose key occurs only in <paramref name="target"/>, in ascending key order.</param>
    /// <param name="matchedItems">Pairs of items with equal keys, in ascending key order.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    /// <remarks>
    /// Duplicate keys are paired one-to-one; surplus duplicates on either side are
    /// reported as outer items of that side.
    /// </remarks>
    public static void OrderedCompare<T1,T2,TKey>(
        this IEnumerable<T1> source,IEnumerable<T2> target,Func<T1,TKey> sourceKeyGetter,Func<T2,TKey> targetKeyGetter,out T1[] leftOuterItems,out T2[] rightOuterItems,out (T1,T2)[] matchedItems) where TKey : IComparable<TKey>
    {
        if (source == null) throw new ArgumentNullException(nameof(source));
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (sourceKeyGetter == null) throw new ArgumentNullException(nameof(sourceKeyGetter));
        if (targetKeyGetter == null) throw new ArgumentNullException(nameof(targetKeyGetter));

        var leftOnly = new List<T1>();
        var rightOnly = new List<T2>();
        var matched = new List<(T1,T2)>();

        // Materialize once into arrays so the merge below can index directly.
        T1[] sortedSource = source.OrderBy(sourceKeyGetter).ToArray();
        T2[] sortedTarget = target.OrderBy(targetKeyGetter).ToArray();

        // Same comparer OrderBy uses by default, so the merge agrees with the sort order
        // and null keys do not cause a NullReferenceException.
        Comparer<TKey> comparer = Comparer<TKey>.Default;

        int i = 0,j = 0;
        while (i < sortedSource.Length && j < sortedTarget.Length)
        {
            T1 sourceItem = sortedSource[i];
            T2 targetItem = sortedTarget[j];

            // Only the sign is meaningful: a comparer may return any negative or positive value.
            int diff = comparer.Compare(sourceKeyGetter(sourceItem),targetKeyGetter(targetItem));
            if (diff < 0)
            {
                leftOnly.Add(sourceItem);
                i++;
            }
            else if (diff > 0)
            {
                rightOnly.Add(targetItem);
                j++;
            }
            else
            {
                matched.Add((sourceItem,targetItem));
                i++;
                j++;
            }
        }

        // Whatever remains on one side after the other is exhausted has no counterpart.
        for (; i < sortedSource.Length; i++)
        {
            leftOnly.Add(sortedSource[i]);
        }
        for (; j < sortedTarget.Length; j++)
        {
            rightOnly.Add(sortedTarget[j]);
        }

        leftOuterItems = leftOnly.ToArray();
        rightOuterItems = rightOnly.ToArray();
        matchedItems = matched.ToArray();
    }

    /// <summary>
    /// Compares two sequences by indexing both in dictionaries keyed by the supplied key selectors.
    /// </summary>
    /// <param name="source">Left-hand sequence.</param>
    /// <param name="target">Right-hand sequence.</param>
    /// <param name="sourceKeyGetter">Extracts the lookup key from a left-hand item.</param>
    /// <param name="targetKeyGetter">Extracts the lookup key from a right-hand item.</param>
    /// <param name="leftOuterItems">Items whose key occurs only in <paramref name="source"/>.</param>
    /// <param name="rightOuterItems">Items whose key occurs only in <paramref name="target"/>.</param>
    /// <param name="matchedItems">Pairs of items with equal keys.</param>
    /// <exception cref="ArgumentNullException">Any argument is null, or a key selector returns null.</exception>
    /// <exception cref="ArgumentException">A key occurs more than once within the same sequence.</exception>
    public static void HashedCompare<T1,T2,TKey>(
        this IEnumerable<T1> source,IEnumerable<T2> target,Func<T1,TKey> sourceKeyGetter,Func<T2,TKey> targetKeyGetter,out T1[] leftOuterItems,out T2[] rightOuterItems,out (T1,T2)[] matchedItems) where TKey : IComparable<TKey>
    {
        if (source == null) throw new ArgumentNullException(nameof(source));
        if (target == null) throw new ArgumentNullException(nameof(target));
        if (sourceKeyGetter == null) throw new ArgumentNullException(nameof(sourceKeyGetter));
        if (targetKeyGetter == null) throw new ArgumentNullException(nameof(targetKeyGetter));

        Dictionary<TKey,T1> sourceDic = source.ToDictionary(sourceKeyGetter);
        Dictionary<TKey,T2> targetDic = target.ToDictionary(targetKeyGetter);

        var leftOnly = new List<T1>();
        var rightOnly = new List<T2>();
        var matched = new List<(T1,T2)>();

        // One lookup per source entry decides between "matched" and "left only".
        foreach (KeyValuePair<TKey,T1> sourceEntry in sourceDic)
        {
            if (targetDic.TryGetValue(sourceEntry.Key,out T2 targetItem))
            {
                matched.Add((sourceEntry.Value,targetItem));
            }
            else
            {
                leftOnly.Add(sourceEntry.Value);
            }
        }

        // Matches were already collected above, so only unmatched target entries remain to be found.
        foreach (KeyValuePair<TKey,T2> targetEntry in targetDic)
        {
            if (!sourceDic.ContainsKey(targetEntry.Key))
            {
                rightOnly.Add(targetEntry.Value);
            }
        }

        leftOuterItems = leftOnly.ToArray();
        rightOuterItems = rightOnly.ToArray();
        matchedItems = matched.ToArray();
    }
}
解决方法
HashedCompare() 中的大部分低效率都源于对字典进行了不必要的枚举和查找。如果您以命令式风格编写算法,就可以避免这些开销,而且在我看来代码也更容易理解:
我赞同 @00110001 的建议,即您应该使用合适的基准测试框架,因为这些不同的实现具有相同的时间复杂度,它们之间的差异需要靠可靠的测量才能看出来。
/// <summary>
/// Partitions two sequences by key using dictionary lookups: pairs with equal keys,
/// items present only in <paramref name="source"/>, and items present only in
/// <paramref name="target"/>.
/// </summary>
/// <param name="source">Left-hand sequence.</param>
/// <param name="target">Right-hand sequence.</param>
/// <param name="sourceKeyGetter">Produces the lookup key for a left-hand item.</param>
/// <param name="targetKeyGetter">Produces the lookup key for a right-hand item.</param>
/// <param name="leftOuterItems">Receives the items whose key is absent from <paramref name="target"/>.</param>
/// <param name="rightOuterItems">Receives the items whose key is absent from <paramref name="source"/>.</param>
/// <param name="matchedItems">Receives the (source, target) pairs that share a key.</param>
public static void HashedCompare<T1,T2,TKey>(
    this IEnumerable<T1> source,IEnumerable<T2> target,Func<T1,TKey> sourceKeyGetter,Func<T2,TKey> targetKeyGetter,out List<T1> leftOuterItems,out List<T2> rightOuterItems,out List<(T1,T2)> matchedItems) where TKey : IEquatable<TKey>
{
    // Index both sides by key; ToDictionary rejects duplicate keys within one sequence.
    Dictionary<TKey,T1> sourceByKey = source.ToDictionary(item => sourceKeyGetter(item));
    Dictionary<TKey,T2> targetByKey = target.ToDictionary(item => targetKeyGetter(item));

    var pairs = new List<(T1,T2)>();
    var onlyInSource = new List<T1>();

    foreach (KeyValuePair<TKey,T1> entry in sourceByKey)
    {
        // No counterpart on the target side: the item belongs to the left outer set.
        if (!targetByKey.TryGetValue(entry.Key,out T2 counterpart))
        {
            onlyInSource.Add(entry.Value);
            continue;
        }

        pairs.Add((entry.Value,counterpart));
    }

    // Matches are already recorded, so only target entries unknown to the source are collected here.
    List<T2> onlyInTarget = targetByKey
        .Where(entry => !sourceByKey.ContainsKey(entry.Key))
        .Select(entry => entry.Value)
        .ToList();

    matchedItems = pairs;
    leftOuterItems = onlyInSource;
    rightOuterItems = onlyInTarget;
}
,
您可以使用 Except 和 Intersect,它们内部都基于 Set(轻量级哈希集)实现,时间复杂度为线性的 O(n)
// Demonstrates the built-in LINQ set operators on the same sample data.
var list1 = new[] { 1,4,2,3,7,6,9,5 };
var list2 = new[] { 15,5,14,13,12,8,11,10 };
// Items that occur only in list1.
Console.WriteLine(string.Join(",",list1.Except(list2)));
// Items that occur only in list2.
Console.WriteLine(string.Join(",",list2.Except(list1)));
// Items that occur in both lists.
Console.WriteLine(string.Join(",",list2.Intersect(list1)));
输出
1,4,2,3,7,6,9
15,14,13,12,8,11,10
5
至于它是更快还是更慢,您必须进行基准测试,但我的直觉是它们会更高效、更快。
关于基准测试的话题
请使用像 BenchmarkDotNet 这样可靠的测试框架;如果你自己动手编写计时代码,有无数种方式会让你得到错误的结果
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。