如何提高大型结构体列表的二进制序列化性能
|| 我有一个结构体,用3个整数保存3D坐标。在一次测试中,我构造了一个包含100万个随机点的List<>,然后使用二进制序列化将其写入内存流。 生成的内存流约为21 MB——这似乎效率很低,因为1000000个点 × 3个坐标 × 4字节,理论上最少只需约11 MB。在我的测试机上,序列化还要花费约3秒。 有什么改善性能和/或大小的想法吗? (如果有帮助的话:我不必保留ISerializable接口,可以直接写入内存流。) 编辑:根据下面的答案,我对BinaryFormatter、“Raw”BinaryWriter和Protobuf做了一次序列化性能对比:
using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using ProtoBuf;
namespace asp_heatmap.test
{
/// <summary>
/// Container for a list of 3D integer points, used to compare three
/// serialization strategies: the framework BinaryFormatter (via
/// <see cref="ISerializable"/>), a hand-written BinaryWriter/BinaryReader
/// format, and protobuf-net (via the ProtoContract/ProtoMember attributes).
/// </summary>
[Serializable()] // For .NET BinaryFormatter
[ProtoContract] // For Protobuf
public class Coordinates : ISerializable
{
    /// <summary>A single point with integer x, y and z components.</summary>
    [Serializable()]
    [ProtoContract]
    public struct CoOrd
    {
        public CoOrd(int x, int y, int z)
        {
            this.x = x;
            this.y = y;
            this.z = z;
        }

        [ProtoMember(1)]
        public int x;
        [ProtoMember(2)]
        public int y;
        [ProtoMember(3)]
        public int z;
    }

    /// <summary>Creates an instance with an empty point list.</summary>
    internal Coordinates()
    {
    }

    /// <summary>The points held by this instance.</summary>
    [ProtoMember(1)]
    public List<CoOrd> Coords = new List<CoOrd>();

    /// <summary>
    /// Appends 1,000,000 points with random non-negative components to
    /// <see cref="Coords"/>.
    /// </summary>
    public void SetupTestArray()
    {
        Random r = new Random();
        for (int i = 0; i < 1000000; i++)
        {
            Coords.Add(new CoOrd(r.Next(), r.Next(), r.Next()));
        }
    }

    #region Using Framework Binary Formatter Serialization

    /// <summary>
    /// Stores the point list under the key "Coords" for the formatter.
    /// </summary>
    void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
    {
        info.AddValue("Coords", this.Coords);
    }

    /// <summary>
    /// Deserialization constructor: restores the point list stored under
    /// the key "Coords".
    /// </summary>
    internal Coordinates(SerializationInfo info, StreamingContext context)
    {
        this.Coords = (List<CoOrd>)info.GetValue("Coords", typeof(List<CoOrd>));
    }

    #endregion

    #region 'Raw' Binary Writer serialization

    /// <summary>
    /// Writes the point count followed by x, y and z of every point as
    /// 32-bit integers into a new memory stream.
    /// </summary>
    /// <returns>
    /// The stream holding the encoded points. Its position is left at the
    /// end of the data, so callers must rewind it before reading.
    /// </returns>
    public MemoryStream RawSerializetoStream()
    {
        // Presize the buffer: 3 ints of 4 bytes per point plus the 4-byte count.
        MemoryStream stream = new MemoryStream(Coords.Count * 3 * 4 + 4);

        // The writer is deliberately not disposed: disposing it would also
        // close the stream that is handed back to the caller.
        BinaryWriter writer = new BinaryWriter(stream);
        writer.Write(Coords.Count);
        foreach (CoOrd point in Coords)
        {
            writer.Write(point.x);
            writer.Write(point.y);
            writer.Write(point.z);
        }

        // Make sure nothing is left pending in the writer before returning.
        writer.Flush();
        return stream;
    }

    /// <summary>
    /// Rebuilds the point list from a stream in the format produced by
    /// <see cref="RawSerializetoStream"/>, reading from the stream's current
    /// position. The stream is closed when reading completes.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of the encoded data.</param>
    public Coordinates(MemoryStream stream)
    {
        using (BinaryReader reader = new BinaryReader(stream))
        {
            int count = reader.ReadInt32();
            Coords = new List<CoOrd>(count);
            for (int i = 0; i < count; i++)
            {
                // Arguments are evaluated left to right, so the values are
                // consumed in x, y, z order.
                Coords.Add(new CoOrd(reader.ReadInt32(), reader.ReadInt32(), reader.ReadInt32()));
            }
        }
    }

    #endregion
}
/// <summary>
/// Round-trips a <see cref="Coordinates"/> instance through each of the three
/// serialization strategies, printing the encoded size and the number of
/// points read back.
/// </summary>
[TestClass]
public class SerializationTest
{
    /// <summary>Round trip through the framework BinaryFormatter.</summary>
    [TestMethod]
    public void TestBinaryFormatter()
    {
        Coordinates c = new Coordinates();
        c.SetupTestArray();

        // NOTE(review): BinaryFormatter is kept only as the baseline of this
        // comparison; it is obsolete in current .NET and must never be used
        // to deserialize untrusted data.
        using (MemoryStream mStream = new MemoryStream())
        {
            // Serialize to memory stream
            BinaryFormatter bformatter = new BinaryFormatter();
            bformatter.Serialize(mStream, c);
            Console.WriteLine("Length : {0}", mStream.Length);

            // Now Deserialize
            mStream.Position = 0;
            Coordinates c2 = (Coordinates)bformatter.Deserialize(mStream);
            Console.Write(c2.Coords.Count);
            Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
        }
    }

    /// <summary>Round trip through the hand-written BinaryWriter format.</summary>
    [TestMethod]
    public void TestBinaryWriter()
    {
        Coordinates c = new Coordinates();
        c.SetupTestArray();

        // The Coordinates(MemoryStream) constructor closes the stream itself;
        // the using block additionally covers a failure before that point.
        using (MemoryStream mStream = c.RawSerializetoStream())
        {
            Console.WriteLine("Length : {0}", mStream.Length);

            // Now Deserialize
            mStream.Position = 0;
            Coordinates c2 = new Coordinates(mStream);
            Console.Write(c2.Coords.Count);
            Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
        }
    }

    /// <summary>Round trip through protobuf-net.</summary>
    [TestMethod]
    public void TestProtoBufV2()
    {
        Coordinates c = new Coordinates();
        c.SetupTestArray();

        using (MemoryStream mStream = new MemoryStream())
        {
            // Serialize the coordinates themselves (the earlier code passed
            // mStream.Length here, so no point was ever written).
            ProtoBuf.Serializer.Serialize(mStream, c);
            Console.WriteLine("Length : {0}", mStream.Length);

            // Now Deserialize
            mStream.Position = 0;
            Coordinates c2 = ProtoBuf.Serializer.Deserialize<Coordinates>(mStream);
            Console.Write(c2.Coords.Count);
            Assert.AreEqual(c.Coords.Count, c2.Coords.Count);
        }
    }
}
}
结果(注:使用 protobuf-net v2.0.0.423 beta)
Serialize | Ser + Deserialize | Size
-----------------------------------------------------------
BinaryFormatter 2.89s | 26.00s !!! | 21.0 MB
ProtoBuf v2 0.52s | 0.83s | 18.7 MB
Raw BinaryWriter 0.27s | 0.36s | 11.4 MB
显然,这只是考虑速度/大小,而没有考虑其他任何因素。
解决方法
使用 BinaryFormatter 进行二进制序列化时,生成的字节中会包含类型信息,这会占用更多空间。当您不知道另一端需要什么样的数据结构时,这一点很有用。
在您的情况下,您知道数据两端都有什么格式,而且听起来好像不会改变。因此,您可以编写一个简单的编码和解码方法。您的CoOrd类也不再需要可序列化。
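我会使用System.IO.BinaryReader和System.IO.BinaryWriter,遍历每个CoOrd实例,并将X、Y、Z的值读/写到流中。假设您的许多数字都小于0x7F和0x7FFF,这些类甚至会将int打包到11MB以下。(注意:BinaryWriter.Write(int)始终写入固定的4个字节;只有7位可变长度编码(Write7BitEncodedInt)才会让较小的数值占用更少的字节,因此按下面的代码写入时,大小固定为 4 + 12 × 点数 字节。)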
像这样:
// Encode the list as a count followed by the X, Y and Z values of every point.
using (var output = new BinaryWriter(stream))
{
    // Count prefix: tells the decoder how many points to read back.
    output.Write(points.Count);

    // Payload: three values per point, in list order.
    foreach (var p in points)
    {
        output.Write(p.X);
        output.Write(p.Y);
        output.Write(p.Z);
    }
}
要从流中读取:
// Decode a stream laid out as an int count followed by three ints per point.
List<CoOrd> points;
using (var input = new BinaryReader(stream))
{
    var total = input.ReadInt32();

    // Presize the list with the count read from the stream.
    points = new List<CoOrd>(total);
    for (int read = 0; read < total; read++)
    {
        // Arguments are evaluated left to right, so the three reads
        // consume x, y and z in that order.
        points.Add(new CoOrd(input.ReadInt32(), input.ReadInt32(), input.ReadInt32()));
    }
}
为了便于使用现成的序列化程序,我建议使用protobuf-net。下面是protobuf-net v2的用法,只需添加一些特性(attribute):
/// <summary>
/// Container for a list of 3D integer points, annotated with
/// DataContract/DataMember attributes (with explicit member order) so that it
/// can be handed to an attribute-driven serializer.
/// </summary>
[DataContract]
public class Coordinates
{
    /// <summary>A single point with integer x, y and z components.</summary>
    [DataContract]
    public struct CoOrd
    {
        public CoOrd(int x, int y, int z)
        {
            this.x = x;
            this.y = y;
            this.z = z;
        }

        [DataMember(Order = 1)]
        private int x;
        [DataMember(Order = 2)]
        private int y;
        [DataMember(Order = 3)]
        private int z;
    }

    /// <summary>The points held by this instance.</summary>
    [DataMember(Order = 1)]
    public List<CoOrd> Coords = new List<CoOrd>();

    /// <summary>
    /// Appends 1,000,000 points to <see cref="Coords"/>. Every component is
    /// drawn from [0, 10000) using a fixed seed, so repeated runs on a fresh
    /// instance produce the same data.
    /// </summary>
    public void SetupTestArray()
    {
        Random r = new Random(123456);
        for (int i = 0; i < 1000000; i++)
        {
            Coords.Add(new CoOrd(r.Next(10000), r.Next(10000), r.Next(10000)));
        }
    }
}
使用:
// Write c into mStream using protobuf-net's static Serializer.
ProtoBuf.Serializer.Serialize(mStream,c);
进行序列化。结果占用10,960,823字节,但请注意,我调整了SetupTestArray,把坐标值限制在10,000以内,因为protobuf-net默认对整数使用“varint”编码,编码后的长度取决于数值大小。这里的10k是随意选取的(事实上,我没有检查编码长度发生变化的“台阶”在哪里)。如果您希望使用固定大小的编码(允许任意取值范围):
// Request fixed-size encoding for each field instead of the default varint
// encoding, so the encoded size no longer depends on the magnitude of the values.
[ProtoMember(1,DataFormat = DataFormat.FixedSize)]
int x;
[ProtoMember(2,DataFormat = DataFormat.FixedSize)]
int y;
[ProtoMember(3,DataFormat = DataFormat.FixedSize)]
int z;
此时序列化结果占用16,998,640字节。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。