如何用 C# 解析一个结构非常不规则的文件
我有一个非常大的文件,它没有任何真正的组织结构,但类似下面这样的行会在文件的不同位置反复出现:
<mcwb:Control ClassId="jdn8732d" ControldisplayName="Type" ControlId="asp_type" datafieldName="asp_type" IsSystemControl="False" IsUnbound="False" SystemStepType="0">
我需要从每一个这样的行中提取以下三个属性的值:
1. ControldisplayName="Type"
2. ControlId="asp_type"
3. datafieldName="asp_type"
我正在努力用 C# 解析这个文件,有人知道我该怎么做吗?
解决方法
如果您确实无法使用 XML 库及其原生的查询功能,下面这个非常简单的实现可以帮助您入门(当然还有更好、更高效、更健壮的做法)。请记得在适当的地方添加异常处理、特殊情况处理以及其他验证。
// Usage example: reads the whole test file into memory and walks every extract
// the searcher reports for it.
private static void Main(string[] args) {
    var occurrenceFinder = new Searcher();
    string fileContents = System.IO.File.ReadAllText(@"c:\temp\testfile.txt");
    System.Collections.Generic.IEnumerable<SearchExtract> extracts = occurrenceFinder.GetOccurances(fileContents);

    foreach (SearchExtract found in extracts) {
        // do something with the extract
    }
}
/// <summary>
/// Holds the values captured for one matched control tag.
/// </summary>
public class SearchExtract {
    /// <summary>
    /// Start index of the match. Makes the match easier to locate in the document when
    /// checking results, and can be used to calculate line numbers.
    /// </summary>
    public int StartIndex { get; set; }

    /// <summary>Captured display-name value.</summary>
    public string ControlDisplayName { get; set; }

    /// <summary>Captured control-id value.</summary>
    public string ControlId { get; set; }

    /// <summary>Captured data-field-name value.</summary>
    public string DataFieldName { get; set; }
}
/// <summary>
/// Scans raw text for <c>&lt;mcwb:Control ...&gt;</c> opening tags and extracts the
/// ControlDisplayName, ControlId and DataFieldName attribute values of each one.
/// This is plain string searching, not XML parsing: it assumes the tag terminator
/// <c>&gt;</c> never appears inside a comment or a quoted attribute value.
/// </summary>
public class Searcher {
    private const string Start = "<mcwb:Control "; // with the space
    private const string End = ">";
    private const char ValueDelimiter = '"';

    /// <summary>
    /// Lazily yields one <see cref="SearchExtract"/> per control tag that carries all three
    /// attributes with double-quoted values. Tags missing any of them are skipped.
    /// </summary>
    /// <param name="input">Full text to search. Null or empty input yields nothing.</param>
    /// <returns>The extracts in document order; <c>StartIndex</c> is the index of the tag's '&lt;'.</returns>
    /// <remarks>
    /// For very large files consider feeding the text from a buffered stream reader instead of
    /// loading everything into a single string.
    /// </remarks>
    public System.Collections.Generic.IEnumerable<SearchExtract> GetOccurances(string input) {
        if (string.IsNullOrEmpty(input)) {
            yield break;
        }

        int startIndex = input.IndexOf(Start, System.StringComparison.Ordinal);
        while (startIndex >= 0) {
            int attributesStart = startIndex + Start.Length;
            int endIndex = input.IndexOf(End, attributesStart, System.StringComparison.Ordinal);
            if (endIndex < 0) {
                // Unterminated tag at end of input: no later tag can be terminated either,
                // so stop instead of restarting the search from the beginning forever.
                yield break;
            }

            // Every attribute lookup is confined to this tag, [attributesStart, endIndex),
            // so values can never leak in from another tag elsewhere in the file.
            string controlDisplayNameValue = FindAttributeValue(input, nameof(SearchExtract.ControlDisplayName), attributesStart, endIndex);
            string controlIdValue = FindAttributeValue(input, nameof(SearchExtract.ControlId), attributesStart, endIndex);
            string dataFieldNameValue = FindAttributeValue(input, nameof(SearchExtract.DataFieldName), attributesStart, endIndex);

            // All three values are required for a valid extract.
            if (controlDisplayNameValue != null && controlIdValue != null && dataFieldNameValue != null) {
                yield return new SearchExtract {
                    StartIndex = startIndex,
                    ControlDisplayName = controlDisplayNameValue,
                    ControlId = controlIdValue,
                    DataFieldName = dataFieldNameValue
                };
            }

            startIndex = input.IndexOf(Start, endIndex + 1, System.StringComparison.Ordinal);
        }
    }

    // Returns the quoted value of the named attribute inside [tagStart, tagEnd), or null when
    // the attribute is absent or has no quoted value. The name is matched case-insensitively
    // because the file's casing ("ControldisplayName", "datafieldName") differs from the
    // property names used as search keys.
    private static string FindAttributeValue(string input, string attributeName, int tagStart, int tagEnd) {
        int searchFrom = tagStart;
        while (searchFrom < tagEnd) {
            int nameIndex = input.IndexOf(attributeName, searchFrom, tagEnd - searchFrom, System.StringComparison.OrdinalIgnoreCase);
            if (nameIndex < 0) {
                return null;
            }

            int afterName = nameIndex + attributeName.Length;

            // Accept only a whole attribute name: preceded by whitespace and followed
            // (after optional whitespace) by '='. This rejects hits that are merely the
            // tail of a longer attribute name.
            bool startsOnBoundary = nameIndex > 0 && char.IsWhiteSpace(input[nameIndex - 1]);
            int cursor = afterName;
            while (cursor < tagEnd && char.IsWhiteSpace(input[cursor])) {
                cursor++;
            }

            if (startsOnBoundary && cursor < tagEnd && input[cursor] == '=') {
                return FindValue(cursor + 1, input, tagEnd);
            }

            searchFrom = afterName;
        }

        return null;
    }

    // Reads a double-quoted value that starts at index (leading whitespace allowed) and closes
    // before limit. Returns null when no such value exists.
    // Further validation, such as a maximum value length, could be added here.
    private static string FindValue(int index, string input, int limit) {
        int openQuote = index;
        while (openQuote < limit && char.IsWhiteSpace(input[openQuote])) {
            openQuote++;
        }

        if (openQuote >= limit || input[openQuote] != ValueDelimiter) {
            return null;
        }

        int valueStart = openQuote + 1;
        int closeQuote = input.IndexOf(ValueDelimiter, valueStart, limit - valueStart);
        if (closeQuote < 0) {
            return null;
        }

        return input.Substring(valueStart, closeQuote - valueStart);
    }
}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。