C# 编译为汇编后,为什么执行更少的汇编指令反而需要更长的时间?
我有一些 C# 代码只有一行不同。
生成的汇编有 4 行不同:其中 1 行是重复的,只是位置不同,另外 3 行被完全删除。
我原以为较短的代码会运行得更快,但实际上它明显变慢了。
请忽略还有很多东西可以优化的事实。这段代码来自我正在写的一篇文章,我在文中一步一步地优化它,没想到这个中间结果反而出现了性能回退。
下面是我放入 Sharplab.io 以获得汇编输出的代码,其输出与 BenchmarkDotNet 的反汇编器产生的结果一致:
using System;
using System.Collections.Generic;
using System.Text;
namespace PerformanceTesting
{
/// <summary>
/// An immutable suggestion: a term together with its count.
/// </summary>
public readonly struct SuggestItem
{
    /// <summary>The suggested term.</summary>
    public readonly string Term;

    /// <summary>The count associated with <see cref="Term"/>.</summary>
    public readonly ulong Count;

    /// <summary>Creates a suggestion from a term and its count.</summary>
    /// <param name="term">Value stored in <see cref="Term"/>.</param>
    /// <param name="count">Value stored in <see cref="Count"/>.</param>
    public SuggestItem(string term, ulong count) => (Term, Count) = (term, count);
}
/// <summary>
/// Array-based graph storage read by <c>Dawg</c>.
/// The child edges of node <c>n</c> are the edge indices in the half-open range
/// <c>[FirstChildEdgeIndex[n], FirstChildEdgeIndex[n + 1])</c>.
/// </summary>
public sealed class CompressedSparseRowGraph
{
    /// <summary>Character carried by each edge, indexed by edge index.</summary>
    public readonly char[] EdgeCharacters;

    /// <summary>
    /// Index of the first child edge of each node; the entry for the following node
    /// is the exclusive end of that range.
    /// </summary>
    public readonly uint[] FirstChildEdgeIndex;

    /// <summary>
    /// Target node of each edge, indexed by edge index. <c>Dawg</c> negates negative
    /// entries to obtain the node index and may record a result on such edges.
    /// </summary>
    public readonly int[] EdgetoNodeIndex;

    /// <summary>Node index at which a lookup starts.</summary>
    public readonly int RootNodeIndex;

    // NOTE(review): not read anywhere in the visible code; meaning inferred from the name only.
    public readonly ushort[] ReachableTerminalNodes;

    // NOTE(review): not read anywhere in the visible code; meaning inferred from the name only.
    public readonly ulong[] WordCounts;

    /// <summary>
    /// Creates a graph whose arrays are all <c>null</c>. Kept only so existing
    /// <c>new CompressedSparseRowGraph()</c> call sites keep compiling; such an
    /// instance cannot be searched.
    /// </summary>
    public CompressedSparseRowGraph()
    {
    }

    /// <summary>
    /// Creates a fully initialized graph. Without this constructor the readonly
    /// fields could never be assigned and would always keep their default values.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any array argument is <c>null</c>.</exception>
    public CompressedSparseRowGraph(
        char[] edgeCharacters,
        uint[] firstChildEdgeIndex,
        int[] edgeToNodeIndex,
        int rootNodeIndex,
        ushort[] reachableTerminalNodes,
        ulong[] wordCounts)
    {
        EdgeCharacters = edgeCharacters ?? throw new ArgumentNullException(nameof(edgeCharacters));
        FirstChildEdgeIndex = firstChildEdgeIndex ?? throw new ArgumentNullException(nameof(firstChildEdgeIndex));
        EdgetoNodeIndex = edgeToNodeIndex ?? throw new ArgumentNullException(nameof(edgeToNodeIndex));
        RootNodeIndex = rootNodeIndex;
        ReachableTerminalNodes = reachableTerminalNodes ?? throw new ArgumentNullException(nameof(reachableTerminalNodes));
        WordCounts = wordCounts ?? throw new ArgumentNullException(nameof(wordCounts));
    }
}
public sealed class Dawg
{
private readonly CompressedSparseRowGraph _graph;
/// <summary>Creates a searcher over the given graph.</summary>
/// <param name="graph">Graph that <c>Lookup</c> will traverse; stored as-is.</param>
public Dawg(CompressedSparseRowGraph graph) => _graph = graph;
/// <summary>
/// Bundles the values shared by every level of <c>Recurse</c> so they can be
/// handed down by reference as a single argument.
/// </summary>
private readonly struct ClosureVariable
{
    /// <summary>The term being looked up.</summary>
    public readonly string word;

    /// <summary>The edit budget passed to <c>Lookup</c>.</summary>
    public readonly uint maxEdits;

    /// <summary>The graph being traversed.</summary>
    public readonly CompressedSparseRowGraph graph;

    /// <summary>Cost rows; <c>Recurse</c> reads row <c>depth</c> and writes row <c>depth + 1</c>.</summary>
    public readonly int[][] matrix;

    /// <summary>Holds the edge characters of the path currently being explored.</summary>
    public readonly StringBuilder builder;

    /// <summary>Receives every suggestion found during the search.</summary>
    public readonly List<SuggestItem> results;

    public ClosureVariable(
        string word,
        uint maxEdits,
        CompressedSparseRowGraph graph,
        int[][] matrix,
        StringBuilder builder,
        List<SuggestItem> results)
    {
        this.word = word;
        this.maxEdits = maxEdits;
        this.graph = graph;
        this.matrix = matrix;
        this.builder = builder;
        this.results = results;
    }
}
/// <summary>
/// Collects the suggestions for <paramref name="word"/> by walking the graph from
/// its root with an edit budget of <paramref name="maxEdits"/>.
/// </summary>
/// <param name="word">Term to look up.</param>
/// <param name="maxEdits">Edit budget; the cost matrix is seeded offset by this value.</param>
/// <returns>The suggestions found (each is created with a count of 0 by <c>Recurse</c>).</returns>
/// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
public IEnumerable<SuggestItem> Lookup(string word, uint maxEdits)
{
    if (word == null)
    {
        throw new ArgumentNullException(nameof(word));
    }

    // The builder is pre-filled with spaces so Recurse can overwrite builder[depth] in place.
    var builderLength = word.Length + (int)maxEdits;
    var builder = new StringBuilder(builderLength);
    builder.Append(' ', builderLength);

    var results = new List<SuggestItem>();

    // One row per depth (0 .. word.Length + maxEdits), one column per prefix length of word.
    var matrix = new int[word.Length + maxEdits + 1][];
    for (var i = 0; i < matrix.Length; i++)
    {
        matrix[i] = new int[word.Length + 1];
        matrix[i][0] = i - (int)maxEdits;

        // Writes 0 into the cell just past the row's stripe. A new int[] is already
        // zero-filled, so this only makes the intended value explicit.
        var stripeEnd = i + maxEdits + 1;
        if (stripeEnd <= word.Length)
        {
            matrix[i][stripeEnd] = 0;
        }
    }

    for (var i = 0; i < matrix[0].Length; i++)
    {
        matrix[0][i] = i - (int)maxEdits;
    }

    var closure = new ClosureVariable(word, maxEdits, _graph, matrix, builder, results);

    // Recurse takes (currentNode, depth, ref closure); the search starts at depth 0.
    Recurse(_graph.RootNodeIndex, 0, ref closure);
    return results;
}
/// <summary>
/// Visits every child edge of <paramref name="currentNode"/>. For each edge it computes
/// row <c>depth + 1</c> of <c>closure.matrix</c> from row <c>depth</c> using the edge's
/// character, and descends into the edge's target node only when at least one computed
/// cell is at most 0.
/// </summary>
/// <param name="currentNode">Index of the node whose child edges are examined.</param>
/// <param name="depth">Number of edges already taken; also the index written in <c>closure.builder</c>.</param>
/// <param name="closure">Shared search state (word, edit budget, graph, matrix, builder, results).</param>
// NOTE(review): the identifiers spelled "prevIoUs..." look like a text-mangled "previous...";
// they are left untouched here because they are code, not comments.
// NOTE(review): the two lines below that start with "-" and "+" are the one-line diff this
// article discusses, not C# syntax; exactly one of them belongs in compilable code.
private static void Recurse(int currentNode,int depth,ref ClosureVariable closure)
{
// The matrix has word.Length + maxEdits + 1 rows, so there is no row depth + 1 past this point.
if (depth == closure.word.Length + closure.maxEdits)
{
return;
}
// Child edges of currentNode are the indices in [firstChild, lastChild).
var firstChild = closure.graph.FirstChildEdgeIndex[currentNode];
var lastChild = closure.graph.FirstChildEdgeIndex[currentNode + 1];
// First column computed for this row: max(depth - maxEdits, 0) + 1.
var from = depth - (int)closure.maxEdits;
if (from < 0)
{
from = 0;
}
from++;
// Exclusive upper column bound: min(word.Length + 1, depth + maxEdits + 2).
// "-" variant: int + uint promotes to long, so both Math.Min arguments are long.
// "+" variant: the minimum is computed in int and only the result is widened to long.
- var to = Math.Min(closure.word.Length + 1,depth + closure.maxEdits + 2);
+ var to = (long)Math.Min(closure.word.Length + 1,depth + (int)closure.maxEdits + 2);
// Character written at the previous depth, or '\0' at the root.
var prevIoUsCharacter = depth > 0 ? closure.builder[depth - 1] : (char)0;
var prevIoUsRow = closure.matrix[depth];
var currentRow = closure.matrix[depth + 1];
for (var childEdge = firstChild; childEdge < lastChild; childEdge++)
{
// Becomes true when some cell of this row is <= 0. Rows are seeded in Lookup with values
// offset by -maxEdits, so this presumably means "still within the edit budget".
var any = false;
var currentCharacter = closure.graph.EdgeCharacters[childEdge];
closure.builder[depth] = currentCharacter;
// Running value of the cell to the left; carried from column to column.
var calculatedcost = depth + 1;
var prevIoUsRowEntry = prevIoUsRow[from - 1];
var targetCharacter = (char)0;
for (var i = from; i < to; i++)
{
var prevIoUsTargetCharacter = targetCharacter;
targetCharacter = closure.word[i - 1];
// Diagonal cell (row depth, column i - 1) and upper cell (row depth, column i).
var prevIoUsRowPrevIoUsEntry = prevIoUsRowEntry;
prevIoUsRowEntry = prevIoUsRow[i];
if (currentCharacter == targetCharacter)
{
// Characters match: take the diagonal value unchanged.
calculatedcost = prevIoUsRowPrevIoUsEntry;
}
else
{
// Otherwise: 1 + min(left, upper, diagonal).
if (prevIoUsRowEntry < calculatedcost)
{
calculatedcost = prevIoUsRowEntry;
}
// If the last two path characters equal the last two word characters swapped,
// the diagonal candidate is replaced by the cell two rows up and two columns left.
if (targetCharacter == prevIoUsCharacter
&& prevIoUsTargetCharacter == currentCharacter)
{
prevIoUsRowPrevIoUsEntry = closure.matrix[depth - 1][i - 2];
}
if (prevIoUsRowPrevIoUsEntry < calculatedcost)
{
calculatedcost = prevIoUsRowPrevIoUsEntry;
}
calculatedcost++;
}
if (calculatedcost <= 0)
{
any = true;
}
currentRow[i] = calculatedcost;
}
// No cell at or below 0: skip this edge and everything beneath it.
if (!any)
{
continue;
}
var nextNode = closure.graph.EdgetoNodeIndex[childEdge];
// A negative entry is negated to obtain the node index; only on such edges can a result
// be recorded (presumably the sign marks that a word ends here; confirm against the graph builder).
if (nextNode < 0)
{
nextNode = -nextNode;
// Record the path only if it is long enough and the last computed cell is <= 0.
// The count is always passed as 0 here.
if (depth >= closure.word.Length - closure.maxEdits - 1
&& calculatedcost <= 0)
{
closure.results.Add(new SuggestItem(closure.builder.ToString(0,depth + 1),0));
}
}
Recurse(nextNode,depth + 1,ref closure);
}
}
}
}
生成的汇编代码中,指令的区别如下:
- L0099: movsxd r15,ecx
- L009c: movsxd rcx,edi
- L009f: mov edx,eax
- L00a1: lea r12,[rcx+rdx+2]
+ L0099: lea edx,[rdi+rax+2]
- L00a6: cmp r15,r12
+ L009d: cmp ecx,edx
- L00a9: jle short L00ad
+ L009f: jle short L00a3
- L00ab: jmp short L00b0
+ L00a1: jmp short L00a5
- L00ad: mov r12,r15
+ L00a3: mov edx,ecx
+ L00a5: movsxd r15,edx
所以使用的寄存器发生了变化,并且只剩下一条 movsxd(位于比较之后),而不是比较之前的两条。
由于寄存器不同,后续代码也有一些变化:第一个版本结束于 L0339,而第二个版本结束于 L0336。这一差距(3 字节)比差异段之后第一条指令处的差距(分别位于 L00b0 和 L00a5,相差 11 字节)要小。
我在使用 zen 2 架构的 ryzen 3800X 上运行它。
我已经阅读了 Agner Fog's optimization manuals 并且没有任何东西让我觉得是放缓的明显原因。也许对齐边界是罪魁祸首,但我不知道如何验证。
在 1000 次 Lookup 调用的基准测试中,第一个版本约为 740 毫秒,而第二个版本约为 770 毫秒。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。