C#如何读取Txt大数据并更新到数据库详解
作者:望着天的蜗牛 时间:2024-01-24 18:51:11
环境
Sqlserver 2016
.net 4.5.2
目前测试数据 1300 万条,大约耗时 3-4 分钟。(限制一次读取的条数和线程数是为了节省服务器资源,如果调得太大,服务器上的其它应用可能就跑不动了。)SqlServerDBHelper 为数据库帮助类,没有什么特别的处理。配置连接串时记得把连接池开起来。
另外,以下代码中每次写入都会新建一个连接。之前试过反复使用同一个连接:130 次写入中大约有 20 多次数据库会出问题,并且总耗时需要 7-8 分钟左右。
配置文件: xxx.json
[ {
/* Connection string */
"ConnStr": "",
"FilePath": "读取的文件地址",
/* Target database table name */
"TableName": "写入的数据库表名",
/* SQL statement executed before the import */
"ExecBeforeSql": "",
/* SQL statement executed after the import */
"ExecAfterSql": "",
/* Column mapping (text-file column name -> database column name) */
"Mapping": [
{
"DBName": "XXX",
"TxtName": "DDD"
}
],
/* Regexes used to filter the data; currently only implemented for small files that are read in one pass */
"FilterRegex": [],
/* Validate the data (column definitions are fetched from the database for the check) */
"CheckData": false,
/* Column separator */
"Separator": "\t",
/* Number of header rows */
"HeaderRowsNum": 1
}
]
读取代码:注意 ConfigurationManager.AppSettings["frpage"](分批读取时每批读取的行数,未配置时默认 50000)和 ConfigurationManager.AppSettings["fr"](文件超过多少 MB 时改为分批读取,未配置时默认 100)
需要自己配置好。
// Load the import configuration: a JSON array with one entry per text file to import.
List<dynamic> dt = JsonConvert.DeserializeObject<List<dynamic>>(File.ReadAllText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "config\\ImportTxt.json")));
// DeserializeObject yields null for an empty file or a literal "null"; treat that as "no entries"
// instead of failing with a NullReferenceException on dt.Count.
int configCount = dt == null ? 0 : dt.Count;
LogUtil.Info("开始读取txt数据,读取配置:" + configCount + "条");
if (configCount == 0)
{
    return;
}
// Start one task per configuration entry, then block until all of them have finished.
List<Task> li = new List<Task>();
foreach (dynamic row in dt)
{
    LogUtil.Info("开始处理数据:" + JsonConvert.SerializeObject(row));
    li.Add(ProcessRow(row));
}
Task.WaitAll(li.ToArray());
LogUtil.Info("数据读取完毕");
/// <summary>
/// Imports one configured text file into its target table. All work runs inside Task.Run.
/// </summary>
/// <remarks>
/// The source file is first copied into a local "tmp" folder. A file larger than the "fr" app setting
/// (in MB, default 100) is streamed line by line and handed to ProcessPath in pages of "frpage" rows
/// (default 50000); the final page is written here once all writer threads have exited.
/// A smaller file is loaded in one pass and written inside a single transaction.
/// </remarks>
/// <param name="row">One entry of the JSON configuration (ConnStr, FilePath, TableName, ExecBeforeSql,
/// ExecAfterSql, Mapping, FilterRegex, CheckData, Separator, HeaderRowsNum).</param>
/// <returns>A task that completes when the import of this entry has finished.</returns>
public async Task ProcessRow(dynamic row)
{
    await Task.Run(() =>
    {
        DataTable Data = null;
        string error = "", ConnStr, TableName, ExecBeforeSql, ExecAfterSql;
        Boolean IsCheck = Convert.ToBoolean(row["CheckData"]);
        TableName = Convert.ToString(row.TableName);
        // NOTE(review): ConnStr is read from the configuration but never passed to SQLServerDBHelper
        // anywhere in this method - confirm how the helper obtains its connection string.
        ConnStr = Convert.ToString(row.ConnStr);
        ExecBeforeSql = Convert.ToString(row.ExecBeforeSql);
        ExecAfterSql = Convert.ToString(row.ExecAfterSql);
        int HeaderRowsNum = Convert.ToInt32(row.HeaderRowsNum);
        string Separator = Convert.ToString(row.Separator);
        Dictionary<string, string> dic = new Dictionary<string, string>();

        // File size (in MB) above which the file is streamed in pages instead of read in one pass.
        int fr = 0;
        if (!int.TryParse(ConfigurationManager.AppSettings["fr"], out fr))
        {
            fr = 100;
        }
        // Computed as long: an int product overflows for settings of 2048 MB or more.
        long chunkThresholdBytes = (long)fr * 1024 * 1024;

        // Number of rows per page when streaming.
        int page = 0;
        if (!int.TryParse(ConfigurationManager.AppSettings["frpage"], out page))
        {
            page = 50000;
        }

        // Mapping: text-file column name -> database column name.
        foreach (var dyn in row.Mapping)
        {
            dic.Add(Convert.ToString(dyn.TxtName), Convert.ToString(dyn.DBName));
        }
        List<string> regex = new List<string>();
        foreach (string item in row["FilterRegex"])
        {
            regex.Add(item);
        }

        // Copy the source file next to the application and read the local copy from here on.
        string fpath = "", cpath = "";
        cpath = Convert.ToString(row["FilePath"]);
        string rootPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tmp");
        if (!Directory.Exists(rootPath))
        {
            Directory.CreateDirectory(rootPath);
        }
        fpath = Path.Combine(rootPath, Path.GetFileName(cpath));
        File.Copy(cpath, fpath, true);
        LogUtil.Info("拷文件到本地已经完成.从本地读取数据操作");

        // After this many accepted pages the loop waits for the running writer threads.
        int threadCount = Environment.ProcessorCount * 3;
        FileInfo fi = new FileInfo(fpath);
        if (fi.Length > chunkThresholdBytes)
        {
            long sumCount = 0;
            int headRow = 0;
            string rowstr = "";
            List<Thread> li_th = new List<Thread>();
            bool last = false;
            int ij = 0;
            // using: the reader is released even when a page fails half-way through the file.
            using (StreamReader sr = new StreamReader(fi.OpenRead()))
            {
                LogUtil.Info("生成StreamReader成功 ");
                #region Line-by-line read
                while (sr.Peek() > -1)
                {
                    rowstr = sr.ReadLine();
                    #region Append the line to the DataTable
                    if (headRow < HeaderRowsNum)
                    {
                        // Header line: (re)build the table's columns from it.
                        Data = new DataTable();
                        foreach (string scol in rowstr.Split(new string[] { Separator }, StringSplitOptions.RemoveEmptyEntries))
                        {
                            Data.Columns.Add(scol.Trim(), typeof(string));
                        }
                        headRow++;
                        continue;
                    }

                    // Data line: with a multi-line header every record spans the same number of lines.
                    if (headRow > 1)
                    {
                        for (int i = 1; i < headRow && sr.Peek() > -1; i++)
                        {
                            rowstr += " " + sr.ReadLine();
                        }
                    }
                    Data.Rows.Add(rowstr.Split(new string[] { Separator }, StringSplitOptions.RemoveEmptyEntries));
                    if (Data.Rows.Count < page && sr.Peek() > -1)
                    {
                        // Page not full yet and more lines available.
                        continue;
                    }
                    last = (sr.Peek() == -1);
                    #endregion

                    sumCount += Data.Rows.Count;
                    int acceptedBefore = ij;
                    ProcessPath(Data, page, sr, ref ij, TableName, ExecBeforeSql, ExecAfterSql, dic, IsCheck, li_th);
                    // ProcessPath bulk-copies the very first accepted page itself (ij goes 0 -> 1).
                    bool writtenAsFirstPage = acceptedBefore == 0 && ij == 1;

                    #region Wait for writer threads
                    if ((ij > 0 && (ij % threadCount) == 0) || last)
                    {
                        LogUtil.Info("完成一批次当前共写数据: " + sumCount);
                        // Poll until every writer thread started so far has exited.
                        while (true)
                        {
                            bool isok = true;
                            foreach (var item in li_th)
                            {
                                if (item.IsAlive)
                                {
                                    isok = false;
                                    Application.DoEvents();
                                    Thread.Sleep(1000);
                                }
                            }
                            if (isok)
                            {
                                li_th.Clear();
                                break;
                            }
                        }
                        // The last page is written only after all other pages are in, so that
                        // ExecAfterSql runs against the complete data.
                        if (sr.Peek() == -1)
                        {
                            if (writtenAsFirstPage)
                            {
                                // The file fitted into a single page that ProcessPath has already
                                // bulk-copied; writing Data again would duplicate every row, so only
                                // the post-import SQL is executed here.
                                if (!string.IsNullOrWhiteSpace(ExecAfterSql))
                                {
                                    using (SQLServerDBHelper afterDb = new SQLServerDBHelper())
                                    {
                                        LogUtil.Info(TableName + "执行后Sql :" + ExecAfterSql);
                                        afterDb.ExecuteNonQuery(ExecAfterSql);
                                    }
                                }
                            }
                            else
                            {
                                // NOTE(review): when ProcessPath rejected this page during validation
                                // it is still written here - confirm whether that is intended.
                                WriteTODB(TableName, Data, ExecBeforeSql, ExecAfterSql, dic, false, true);
                            }
                            LogUtil.Info("最后一次写入完成");
                        }
                        LogUtil.Info(" 线程退出开始新的循环...");
                    }
                    // Rows are dropped, columns are kept for the next page.
                    Data.Clear();
                    #endregion
                }
                #endregion
            }
        }
        else
        {
            using (SQLServerDBHelper sdb = new SQLServerDBHelper())
            {
                sdb.OpenConnection();
                #region One-pass read
                // The helper is passed along so that validation has a database helper to query with.
                Data = LoadDataTableFromTxt(fpath, ref error, Separator, HeaderRowsNum, regex, IsCheck, dic, TableName, sdb);
                // Inspect ErrorMsg only when validation actually ran: on a structural error the column
                // may not exist, and Select would throw on the unknown column.
                if (IsCheck && string.IsNullOrWhiteSpace(error) && Data.Columns.Contains("ErrorMsg"))
                {
                    // Rows without problems carry an empty string, so "is not null" alone would match every row.
                    DataRow[] rows = Data.Select("ErrorMsg is not null and ErrorMsg <> ''");
                    if (rows.Length > 0)
                    {
                        LogUtil.Info($"读取{TableName} 数据出错 : {JsonConvert.SerializeObject(rows)}");
                        return;
                    }
                }
                LogUtil.Info($"读取{TableName} 的txt数据完成.共读取数据:{Data.Rows.Count}条");
                if (Data.Rows.Count == 0 || !string.IsNullOrWhiteSpace(error))
                {
                    if (!string.IsNullOrWhiteSpace(error))
                    {
                        LogUtil.Info("读取数据出错,地址:" + Convert.ToString(row["FilePath"]) + " \r\n 错误:" + error);
                    }
                    return;
                }
                sdb.BgeinTransaction();
                try
                {
                    WriteTODB(TableName, Data, ExecBeforeSql, ExecAfterSql, dic, sdb: sdb);
                    sdb.CommitTransaction();
                    LogUtil.Info(TableName + "数据更新完毕 !!");
                }
                catch (Exception ex)
                {
                    LogUtil.Info(TableName + " 更新数据出错,错误:" + ex.Message + " \r\n 堆栈:" + ex.StackTrace);
                    sdb.RollbackTransaction();
                }
                #endregion
            }
        }
        // Kept from the original: explicit collection after the import.
        GC.Collect();
    });
}
/// <summary>
/// Handles one page of a streamed import: optionally validates it, then writes a snapshot of it.
/// </summary>
/// <remarks>
/// The first accepted page is written synchronously with first = true (so ExecBeforeSql runs).
/// Later pages are written on a background thread as long as the reader has more data.
/// The final page is not written here; the caller writes it after the threads have finished.
/// </remarks>
/// <param name="Data">Rows of the current page; the caller clears and reuses this table.</param>
/// <param name="page">Page size; not used by this method.</param>
/// <param name="sr">Reader of the source file, only peeked to detect the end of the file.</param>
/// <param name="ij">Count of accepted pages; incremented for every page that passes validation.</param>
/// <param name="TableName">Target table.</param>
/// <param name="ExecBeforeSql">SQL forwarded to WriteTODB to run before the first write.</param>
/// <param name="ExecAfterSql">SQL forwarded to WriteTODB.</param>
/// <param name="dic">Mapping from text-file column name to database column name.</param>
/// <param name="IsCheck">Whether to validate the page with CheckData first.</param>
/// <param name="li_th">Receives every writer thread started here so the caller can wait for it.</param>
private void ProcessPath(DataTable Data, int page, StreamReader sr, ref int ij, string TableName, string ExecBeforeSql, string ExecAfterSql, Dictionary<string, string> dic, bool IsCheck, List<Thread> li_th)
{
    if (IsCheck)
    {
        string error;
        using (SQLServerDBHelper sdb = new SQLServerDBHelper())
        {
            error = CheckData(Data, TableName, dic, sdb);
        }
        // CheckData can return a structural error before it adds the ErrorMsg column; selecting on a
        // missing column would throw. Clean rows hold an empty string, so they are excluded explicitly.
        DataRow[] rows = Data.Columns.Contains("ErrorMsg")
            ? Data.Select("ErrorMsg is not null and ErrorMsg <> ''")
            : new DataRow[0];
        if (rows.Length > 0 || !string.IsNullOrWhiteSpace(error))
        {
            LogUtil.Info($"读取{TableName} 数据出错 : {JsonConvert.SerializeObject(rows)}\r\n错误: " + error);
            return;
        }
    }

    // Snapshot the page after validation: the caller clears Data while writer threads still run, and
    // copying here means the values normalised by CheckData are the ones written (as in the one-pass
    // path) and no copy is made for a rejected page.
    PoolModel p = new PoolModel
    {
        TableName = TableName,
        ExecBeforeSql = ExecBeforeSql,
        ExecAfterSql = ExecAfterSql,
        dic = dic,
        Data = Data.Copy()
    };

    ij++;
    if (ij == 1)
    {
        WriteTODB(p.TableName, p.Data, p.ExecBeforeSql, p.ExecAfterSql, p.dic, true, false);
        LogUtil.Info("首次写入完成");
    }
    else if (sr.Peek() > -1)
    {
        Thread t = new Thread(d =>
        {
            PoolModel c = (PoolModel)d;
            try
            {
                WriteTODB(c.TableName, c.Data, c.ExecBeforeSql, c.ExecAfterSql, c.dic, false, false);
            }
            catch (ThreadAbortException)
            {
                LogUtil.Error("线程退出.................");
            }
            catch (Exception ex)
            {
                // A failed page cannot be recovered here: log it with its data and shut down.
                LogUtil.Error(c.TableName + "写入数据失败:" + ex.Message + "\r\n堆栈:" + ex.StackTrace + "\r\n 数据: " + JsonConvert.SerializeObject(c.Data));
                ExitApp();
                return;
            }
        });
        t.IsBackground = true;
        t.Start(p);
        li_th.Add(t);
    }
}
/// <summary>
/// Requests application shutdown by calling Application.Exit().
/// </summary>
public void ExitApp() => Application.Exit();
/// <summary>
/// Bulk-copies <paramref name="Data"/> into <paramref name="TableName"/>, optionally running the
/// configured SQL before and/or after the copy.
/// </summary>
/// <param name="TableName">Target table.</param>
/// <param name="Data">Rows to write.</param>
/// <param name="ExecBeforeSql">SQL executed before the copy when <paramref name="first"/> is true and the text is not blank.</param>
/// <param name="ExecAfterSql">SQL executed after the copy when <paramref name="last"/> is true and the text is not blank.</param>
/// <param name="dic">Column mapping passed to the helper's BulkCopy.</param>
/// <param name="first">True when this is the first write of an import.</param>
/// <param name="last">True when this is the last write of an import.</param>
/// <param name="sdb">Helper to use. When null, a helper is created for this call and disposed afterwards;
/// a helper supplied by the caller is left open.</param>
public void WriteTODB(string TableName, DataTable Data, string ExecBeforeSql, string ExecAfterSql, Dictionary<string, string> dic, bool first = true, bool last = true, SQLServerDBHelper sdb = null)
{
    bool ownsHelper = sdb == null;
    if (ownsHelper)
    {
        sdb = new SQLServerDBHelper();
    }
    try
    {
        if (first && !string.IsNullOrWhiteSpace(ExecBeforeSql))
        {
            LogUtil.Info(TableName + "执行前Sql :" + ExecBeforeSql);
            sdb.ExecuteNonQuery(ExecBeforeSql);
        }
        sdb.BulkCopy(Data, TableName, dic);
        if (last && !string.IsNullOrWhiteSpace(ExecAfterSql))
        {
            LogUtil.Info(TableName + "执行后Sql :" + ExecAfterSql);
            sdb.ExecuteNonQuery(ExecAfterSql);
        }
        LogUtil.Info(TableName + "本次执行完成 ");
    }
    finally
    {
        // Dispose the helper created here even when a statement or the bulk copy throws.
        if (ownsHelper)
        {
            sdb.Dispose();
        }
    }
}
/// <summary>
/// Validates every mapped column of <paramref name="dt"/> against the column definitions of
/// <paramref name="dbTableName"/> and records per-row problems in an "ErrorMsg" column.
/// </summary>
/// <param name="dt">Table to validate. Values are trimmed in place, and non-string values are
/// replaced by their converted form; an "ErrorMsg" column is added when missing.</param>
/// <param name="dbTableName">Database table whose column definitions are queried.</param>
/// <param name="dic">Mapping from column name in <paramref name="dt"/> to database column name.</param>
/// <param name="sdb">Helper used to query the table definition.</param>
/// <returns>
/// An empty string when validation ran (row-level problems are then in "ErrorMsg", which is an empty
/// string for clean rows); otherwise a message describing a structural problem (blank table name,
/// empty mapping, missing source column, or a mapped database column that does not exist).
/// </returns>
public string CheckData(DataTable dt, string dbTableName, Dictionary<string, string> dic, SQLServerDBHelper sdb)
{
    if (string.IsNullOrWhiteSpace(dbTableName))
    {
        return "表名不能为空!";
    }
    if (dic.Count == 0)
    {
        return "映射关系数据不存在!";
    }

    // Every mapped source column must exist in the data.
    List<string> errorMsg = new List<string>();
    foreach (KeyValuePair<string, string> pair in dic)
    {
        if (!dt.Columns.Contains(pair.Key))
        {
            errorMsg.Add(pair.Key);
        }
    }
    if (errorMsg.Count > 0)
    {
        return "数据列不完整,请与映射表的数据列数量保持一致!列:" + string.Join(",", errorMsg);
    }

    // Row-level messages go into this column. The same table can be validated more than once
    // (the chunked import reuses one table per page), so add the column only when it is missing.
    if (!dt.Columns.Contains("ErrorMsg"))
    {
        dt.Columns.Add(new DataColumn("ErrorMsg", typeof(string)) { DefaultValue = "" });
    }

    string sql = @"--获取SqlServer中表结构
SELECT syscolumns.name as ColName,systypes.name as DBType,syscolumns.isnullable,
syscolumns.length
FROM syscolumns, systypes
WHERE syscolumns.xusertype = systypes.xusertype
AND syscolumns.id = object_id(@tb) ; ";
    DataSet ds = sdb.GetDataSet(sql, new SqlParameter[] { new SqlParameter("@tb", dbTableName) });

    // Index the column definitions by database column name.
    var dic_Def = ds.Tables[0].AsEnumerable().ToDictionary(c => Convert.ToString(c["ColName"]), d =>
    {
        string sqlType = Convert.ToString(d["DBType"]).ToUpper();
        return new
        {
            ColName = Convert.ToString(d["ColName"]),
            DBType = GetCSharpType(sqlType),
            SqlType = sqlType,
            IsNullble = Convert.ToBoolean(d["isnullable"]),
            Length = Convert.ToInt32(d["length"])
        };
    });

    // Date prefix used to parse TIME values as a full date-time.
    string today = DateTime.Now.ToString("yyyy-MM-dd");
    foreach (DataRow row in dt.Rows)
    {
        errorMsg.Clear();
        foreach (KeyValuePair<string, string> pair in dic)
        {
            string colName = pair.Key;
            if (!dic_Def.ContainsKey(pair.Value))
            {
                return "Excel列名:" + colName + " 映射数据表字段:" + pair.Value + "在当前数据表中不存在!";
            }
            var info = dic_Def[pair.Value];

            // Test for a missing value BEFORE converting to text: Convert.ToString turns DBNull into
            // "", which would make this check unreachable. A missing value stays DBNull in the row.
            if (row.IsNull(colName))
            {
                if (!info.IsNullble)
                {
                    errorMsg.Add("列" + colName + "不能为空!");
                }
                continue;
            }

            // Strip surrounding whitespace and store the trimmed value back.
            string text = Convert.ToString(row[colName]).Trim();
            row[colName] = text;

            if (info.DBType == "String")
            {
                if (info.SqlType == "TIME")
                {
                    // TIME has no length to check (e.g. 17:12:30.0000); it only has to parse.
                    DateTime parsed;
                    if (!DateTime.TryParse(today + " " + text, out parsed))
                    {
                        errorMsg.Add("列" + colName + "填写的数据无效应为日期的时间部分如:17:30:12");
                    }
                }
                else
                {
                    int maxChars = GetMaxCharLength(info.SqlType, info.Length);
                    if (maxChars >= 0 && text.Length > maxChars)
                    {
                        errorMsg.Add("列" + colName + "长度超过配置长度:" + maxChars);
                    }
                }
            }
            else
            {
                Type t = Type.GetType("System." + info.DBType);
                try
                {
                    // Converting also normalises the text (e.g. thousands separators in decimals);
                    // the converted value is written back to the row.
                    row[colName] = Convert.ChangeType(text, t);
                }
                catch (Exception)
                {
                    errorMsg.Add("列" + colName + "填写的数据" + text + "无效应为" + info.SqlType + "类型.");
                }
            }
        }
        row["ErrorMsg"] = string.Join(" || ", errorMsg);
    }
    return "";
}

/// <summary>
/// Converts the syscolumns storage length of a character column into a limit in characters.
/// </summary>
/// <param name="sqlType">Upper-case SQL type name.</param>
/// <param name="storageLength">Value of syscolumns.length.</param>
/// <returns>Maximum number of characters, or -1 when no limit should be enforced.</returns>
private static int GetMaxCharLength(string sqlType, int storageLength)
{
    // syscolumns.length is a storage length in bytes: it is -1 for (n)varchar(max), and for
    // text/ntext it is the size of the in-row pointer, not a limit on the content.
    if (storageLength < 0 || sqlType == "TEXT" || sqlType == "NTEXT")
    {
        return -1;
    }
    // nchar/nvarchar use two bytes per character.
    if (sqlType == "NCHAR" || sqlType == "NVARCHAR")
    {
        return storageLength / 2;
    }
    return storageLength;
}
/// <summary>
/// wm, 2018-11-28 13:37.
/// Maps a common SQL Server type name to the name of the .NET type (in the System namespace)
/// used to validate values of that type.
/// </summary>
/// <param name="old">Upper-case SQL Server type name, e.g. "INT" or "NVARCHAR".</param>
/// <returns>The .NET type name without namespace, e.g. "Int32", "Decimal", "String".</returns>
/// <exception cref="NotSupportedException">The type name is not one of the recognised types.</exception>
private string GetCSharpType(string old)
{
    switch (old)
    {
        case "TINYINT":
            return "Byte";
        case "INT":
        case "SMALLINT":
            return "Int32";
        case "BIGINT":
            // bigint is 64-bit; validating it as Int32 rejects valid values above int.MaxValue.
            return "Int64";
        case "DECIMAL":
        case "FLOAT":
        case "REAL":
        case "NUMERIC":
        case "MONEY":
        case "SMALLMONEY":
            return "Decimal";
        case "BIT":
            return "Boolean";
        case "TEXT":
        case "NTEXT":
        case "CHAR":
        case "NCHAR":
        case "VARCHAR":
        case "NVARCHAR":
        case "TIME":
            // TIME is kept as text; callers validate it by parsing rather than by length.
            return "String";
        case "DATE":
        case "DATETIME":
        case "DATETIME2":
        case "SMALLDATETIME":
            return "DateTime";
        default:
            // Report the offending SQL type name (the original printed an always-empty variable).
            throw new NotSupportedException("GetCSharpType数据类型" + old + "无法识别!");
    }
}
/// <summary>
/// Argument bundle for one page of a chunked import. ProcessPath fills it and passes it to the
/// writer thread, which forwards the members to WriteTODB.
/// </summary>
public class PoolModel
{
/// <summary>Name of the table the page is written to.</summary>
public string TableName { get; set; }
/// <summary>Rows of the page; ProcessPath assigns a copy of the caller's table.</summary>
public DataTable Data { get; set; }
/// <summary>SQL forwarded to WriteTODB as the pre-import statement.</summary>
public string ExecBeforeSql { get; set; }
/// <summary>SQL forwarded to WriteTODB as the post-import statement.</summary>
public string ExecAfterSql { get; set; }
/// <summary>Column mapping: text-file column name to database column name.</summary>
public Dictionary<string, string> dic { get; set; }
}
/// <summary>
/// wm, 2018-11-28 13:32.
/// Reads a delimited text file into a DataTable in one pass and optionally validates it.
/// When validation runs, the table gets an "ErrorMsg" column holding the reason a row failed.
/// Note: the target database table must already exist when validation is requested.
/// </summary>
/// <param name="path">Path of the file to read.</param>
/// <param name="error">Receives errors that are not row-level validation results (bad arguments,
/// duplicate header columns, structural validation failures); empty when there are none.</param>
/// <param name="Separator">Column separator.</param>
/// <param name="HeaderRowsNum">Number of header lines. With more than one header line, each record
/// spans the same number of lines.</param>
/// <param name="Regexs">Regular expressions whose matches are removed from the text before parsing.</param>
/// <param name="isCheck">Whether to validate the data against the definition of <paramref name="dbTableName"/>.</param>
/// <param name="map">Required for validation: key is the column name in the file, value the database column name.</param>
/// <param name="dbTableName">Table the data will be inserted into; used for validation.</param>
/// <param name="sdb">Helper used for validation. When null, a helper is created for the validation and disposed afterwards.</param>
/// <returns>The parsed table; with validation it carries the "ErrorMsg" column described above.</returns>
public DataTable LoadDataTableFromTxt(string path, ref string error, string Separator, int HeaderRowsNum, List<string> Regexs = null, bool isCheck = false, Dictionary<string, string> map = null, string dbTableName = "", SQLServerDBHelper sdb = null)
{
    DataTable dt = new DataTable();
    error = "";
    if (isCheck && (map == null || map.Count == 0 || string.IsNullOrWhiteSpace(dbTableName)))
    {
        error = "参数标明需要对表格数据进行校验,但没有指定映射表集合或数据表名.";
        return dt;
    }

    string txts = File.ReadAllText(path);
    #region Convert the text into a DataTable
    // Strip everything matched by the filter expressions.
    Regexs?.ForEach(c =>
    {
        txts = new Regex(c).Replace(txts, "");
    });

    // Accept both CRLF and LF line endings; splitting on "\r\n" alone turns an LF-only file into one line.
    string[] tts = txts.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
    string[] separators = new string[] { Separator };

    // -1 until the first line is seen, 0 while header lines are being read,
    // HeaderRowsNum once the header is complete.
    int headerNum = -1;
    // Occurrence count per header name, used to report duplicated columns.
    Dictionary<string, int> col_Rep = new Dictionary<string, int>();
    bool isre = false;

    for (int i = 0; i < tts.Length; i++)
    {
        string s1 = tts[i];

        // Header not complete yet.
        if (headerNum < HeaderRowsNum)
        {
            foreach (string col in s1.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                string colName = col.Trim();
                if (col_Rep.ContainsKey(colName))
                {
                    // Duplicate: counted here, reported when the first data line is reached.
                    col_Rep[colName]++;
                    isre = true;
                    continue;
                }
                col_Rep.Add(colName, 1);
                dt.Columns.Add(colName, typeof(string));
            }
            headerNum = (i == (HeaderRowsNum - 1)) ? HeaderRowsNum : 0;
            continue;
        }

        // Skip lines that are blank or contain nothing but separators.
        if (string.IsNullOrWhiteSpace(s1) || string.IsNullOrWhiteSpace(s1.Replace(Separator, "")))
        {
            continue;
        }
        if (isre)
        {
            error = "列:" + string.Join(",", col_Rep.Where(c => c.Value > 1).Select(c => c.Key)) + "存在重复";
            return dt;
        }

        if (headerNum > 1)
        {
            // Multi-line records: join the following lines. The space keeps a continuation from
            // fusing with the last value of the previous line.
            for (int j = 1; j < headerNum && (i + j) < tts.Length; j++)
            {
                s1 += " " + tts[i + j];
            }
        }

        DataRow dr = dt.NewRow();
        dr.ItemArray = s1.Split(separators, StringSplitOptions.RemoveEmptyEntries);
        dt.Rows.Add(dr);

        // Skip the lines consumed above; the loop's own i++ accounts for one of them.
        // Guarded so that headerNum == 0 (HeaderRowsNum of 0) cannot step backwards and loop forever.
        if (headerNum > 1)
        {
            i += headerNum - 1;
        }
    }
    #endregion

    if (isCheck)
    {
        if (sdb != null)
        {
            error = CheckData(dt, dbTableName, map, sdb);
        }
        else
        {
            // CheckData queries the database, so it needs a helper; create one for this call only.
            using (SQLServerDBHelper ownHelper = new SQLServerDBHelper())
            {
                error = CheckData(dt, dbTableName, map, ownHelper);
            }
        }
    }
    return dt;
}
来源:https://www.cnblogs.com/itstac/p/11381432.html
标签:c#,txt,大数据
0
投稿
猜你喜欢
Python中Django与Echarts的结合用法图文详解
2022-02-09 23:10:29
Python使用正则表达式实现爬虫数据抽取
2021-01-22 15:00:24
Web开发人员常用速查手册 英文集合推荐
2023-03-27 22:22:03
Yahoo!网站性能最佳体验的34条黄金守则——图片、Coockie与移动应用
2008-05-29 13:44:00
php实现贪吃蛇小游戏
2024-05-02 17:16:36
Python selenium把歌词评论做成词云图
2022-12-29 10:17:35
asp 实现检测字符串是否为纯字母和数字组合的函数
2009-10-04 20:39:00
JavaScript转换与解析JSON方法实例详解
2024-04-17 10:22:44
python 实现Flask中返回图片流给前端展示
2023-12-01 06:58:23
用Python中的字典来处理索引统计的方法
2022-05-28 19:43:45
windowns使用PySpark环境配置和基本操作
2021-04-12 06:43:06
关于Python中*args和**kwargs的深入理解
2021-04-07 17:45:06
MySQL执行时间的查询
2024-01-14 13:54:25
Golang初始化MySQL数据库方法浅析
2024-01-16 23:49:20
Python实现邮件发送功能的示例详解
2022-08-18 13:15:43
Linux 修改Python命令的方法示例
2023-10-04 18:16:58
Python入门篇之字典
2022-09-24 23:53:47
Python人工智能之混合高斯模型运动目标检测详解分析
2021-11-01 21:28:43
python3操作mysql数据库的方法
2023-07-28 15:48:27
ASP Crazy 模版操作类(最简单的模板类、仅提供交流)
2011-03-17 11:06:00