使用dotlucene为数据库建立全文索引
CreateIndex.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Cn;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using System.Data.SqlClient;
namespace YWG_Indexer
{
/// <summary>
/// Builds a Lucene.Net full-text index from the rows returned by a SQL Server query.
/// </summary>
public class CreateIndex
{
    // Index writer that receives every document built from the database rows.
    private IndexWriter writer;

    // Id column value of the row most recently added to the index; null until a row has been indexed.
    private string _id;

    /// <summary>
    /// Id column value (as text) of the last row added by <see cref="AddHtmlToDocument"/>,
    /// or null when no row has been added yet.
    /// </summary>
    public string id
    {
        get { return _id; }
    }

    /// <summary>
    /// Opens an index writer over <paramref name="directory"/> that analyzes text with ChineseAnalyzer
    /// and stores the index in compound-file format.
    /// </summary>
    /// <param name="directory">Path of the index directory (a string path, handed to the IndexWriter constructor).</param>
    /// <param name="boolCreate">
    /// true to create the index, overwriting any index already in the directory;
    /// false to open the existing index and append to it.
    /// </param>
    public CreateIndex(string directory, bool boolCreate)
    {
        writer = new IndexWriter(directory, new ChineseAnalyzer(), boolCreate);
        writer.SetUseCompoundFile(true);
    }

    /// <summary>
    /// Runs <paramref name="sql"/> and adds one document per returned row to the index.
    /// Each document gets an unstored "text" field (all columns after the first, joined by spaces),
    /// a keyword "id" field and a tokenized "title" field.
    /// </summary>
    /// <param name="sql">SQL statement to execute.</param>
    /// <param name="sqlconn">Database connection string.</param>
    /// <param name="indexColumnName">Name of the column holding the record id.</param>
    /// <param name="titleColumnName">Name of the column holding the record title.</param>
    public void AddHtmlToDocument(string sql, string sqlconn, string indexColumnName, string titleColumnName)
    {
        // Dispose connection, command and reader deterministically, even when indexing fails half-way.
        using (SqlConnection conn = new SqlConnection(sqlconn))
        using (SqlCommand cmd = new SqlCommand(sql, conn))
        {
            conn.Open();
            using (SqlDataReader sdr = cmd.ExecuteReader(System.Data.CommandBehavior.CloseConnection))
            {
                int icount = sdr.FieldCount;
                while (sdr.Read())
                {
                    // Column 0 is skipped: the sample queries select the id column first,
                    // and the id is indexed separately below.
                    StringBuilder html = new StringBuilder();
                    for (int i = 1; i < icount; i++)
                    {
                        // A separator keeps the last token of one column from merging
                        // with the first token of the next.
                        if (html.Length > 0)
                        {
                            html.Append(' ');
                        }
                        // Fixed: this used to read sdr[1] on every pass, indexing column 1 repeatedly.
                        html.Append(sdr[i].ToString());
                    }

                    string rowId = sdr[indexColumnName].ToString();

                    Document doc = new Document();
                    doc.Add(Field.UnStored("text", html.ToString()));
                    doc.Add(Field.Keyword("id", rowId));
                    // NOTE(review): the current timestamp is appended to the stored title, as in the
                    // original code; this looks like debugging output - confirm whether it is wanted.
                    doc.Add(Field.Text("title", sdr[titleColumnName].ToString() + DateTime.Now.ToString()));
                    writer.AddDocument(doc);

                    // Record the id only after the document was accepted by the writer.
                    _id = rowId;
                }
            }
        }
    }

    /// <summary>
    /// Strips all HTML tags from the text and replaces non-breaking spaces with ordinary spaces.
    /// </summary>
    /// <param name="html">Text that may contain HTML markup.</param>
    /// <returns>The text without tags and without non-breaking spaces.</returns>
    private string ParseHtml(string html)
    {
        string temp = Regex.Replace(html, "<[^>]*>", "");
        // The published listing lost the "&nbsp;" literal (it replaced a space with a space);
        // handle both the entity and the literal U+00A0 character.
        return temp.Replace("&nbsp;", " ").Replace("\u00A0", " ");
    }

    /// <summary>
    /// Optimizes the index and closes the writer.
    /// </summary>
    public void Close()
    {
        try
        {
            writer.Optimize();
        }
        finally
        {
            // Always close the writer, even if optimization fails.
            writer.Close();
        }
    }
}
}
Program.cs
using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;
namespace YWG_Indexer
{
/// <summary>
/// Console front end: lists the projects defined in indexer.xml, asks which one to index and whether
/// to rebuild the whole index, runs the indexer and stores the last indexed id back into indexer.xml.
/// </summary>
class Program
{
    /// <summary>
    /// Program entry point. Reads indexer.xml from the current directory and interacts via the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        XmlDocument xdoc = new XmlDocument();
        xdoc.Load("indexer.xml");
        XmlNodeList xnl = xdoc.SelectSingleNode("body").ChildNodes;

        // Show the list of configured projects.
        Console.WriteLine("药王谷数据索引创建程序\n");
        foreach (XmlNode xn in xnl)
        {
            // Skip comments and other non-element nodes instead of failing on the cast.
            XmlElement bb = xn as XmlElement;
            if (bb == null)
            {
                continue;
            }
            Console.WriteLine(bb.GetAttribute("id") + " " + bb.GetAttribute("tableName"));
        }

        Console.WriteLine("请选择要索引的表:");
        string id = Console.ReadLine();

        // Exact match on the id attribute. The former substring test accepted "", "," or "1,"
        // and then continued with null settings.
        XmlElement selected = FindProject(xnl, id);
        if (selected == null)
        {
            Console.WriteLine("您选择的是一个不存在的表ID");
            return;
        }

        // Settings are read by position, so the child order inside <project> matters.
        string sqlconn = selected.ChildNodes[0].InnerText;
        string sql = selected.ChildNodes[1].InnerText;
        string indexDirectory = selected.ChildNodes[2].InnerText;
        string titleColumnName = selected.ChildNodes[3].InnerText;
        string indexColumnName = selected.ChildNodes[4].InnerText;
        string startID = selected.ChildNodes[5].InnerText;

        Console.WriteLine("是否要重建全部索引(全部重建需要20分钟以上,注意大小写),默认为否(Yes/N)");
        bool reCreate = Console.ReadLine() == "Yes";

        // Incremental run: only fetch rows whose id is greater than the stored checkpoint.
        // The configured SQL must therefore not already contain a WHERE clause.
        if (!reCreate)
        {
            sql += " where " + indexColumnName + ">" + startID;
        }

        Console.WriteLine("正在创建索引。。。。。。");
        DateTime startTime = DateTime.Now;
        CreateIndex writer = new CreateIndex(indexDirectory, reCreate);
        try
        {
            writer.AddHtmlToDocument(sql, sqlconn, indexColumnName, titleColumnName);
        }
        finally
        {
            // Close the writer even when indexing fails, so the index is not left open.
            writer.Close();
        }

        // Store the checkpoint only after the index was closed successfully, and only when at least
        // one row was indexed; otherwise an empty startID would break the next incremental query.
        // NOTE(review): writer.id is the id of the LAST row read, which is the maximum id only if
        // the query returns rows in ascending id order (the SQL has no ORDER BY) - confirm.
        if (!string.IsNullOrEmpty(writer.id))
        {
            selected.ChildNodes[5].InnerText = writer.id;
            xdoc.Save("indexer.xml");
        }

        Console.WriteLine(writer.id);
        Console.WriteLine("创建索引完毕,共花费时间: " + (DateTime.Now - startTime));
    }

    /// <summary>
    /// Returns the first element in <paramref name="nodes"/> whose "id" attribute equals the
    /// trimmed <paramref name="id"/>, or null when the input is null/blank or nothing matches.
    /// </summary>
    private static XmlElement FindProject(XmlNodeList nodes, string id)
    {
        // Console.ReadLine returns null at end of input.
        if (id == null)
        {
            return null;
        }

        string wanted = id.Trim();
        if (wanted.Length == 0)
        {
            return null;
        }

        foreach (XmlNode node in nodes)
        {
            XmlElement element = node as XmlElement;
            if (element != null && element.GetAttribute("id") == wanted)
            {
                return element;
            }
        }
        return null;
    }

    // Unused sample kept from the original listing: appends a new <project> element to indexer.xml.
    //private static void xmlWriter()
    //{
    // XmlDocument xdoc = new XmlDocument();
    // xdoc.Load("indexer.xml");
    // XmlElement xe = xdoc.CreateElement("project");
    // XmlAttribute xa = xdoc.CreateAttribute("tableName");
    // xa.InnerText = "zhaoshang";
    // xe.SetAttributeNode(xa);
    // XmlElement sqlconn = xdoc.CreateElement("sqlconn");
    // sqlconn.InnerText = "serklsajfd;";
    // xe.AppendChild(sqlconn);
    // XmlElement sql = xdoc.CreateElement("sql");
    // sql.InnerText = "serklsajfd;";
    // xe.AppendChild(sql);
    // xdoc.DocumentElement.AppendChild(xe);
    // xdoc.Save("indexer.xml");
    //}
}
}
indexer.xml
<?xml version="1.0" encoding="utf-8"?>
<!--
  Indexer configuration read by Program.cs.
  Each <project> describes one table to index: its "id" and "tableName" attributes are printed in
  the selection menu, and "id" is what the user types to choose the project.
  Program.cs reads the children of a <project> by position, so keep exactly this order:
    1. sqlconn          database connection string
    2. sql              query to run; on incremental runs " where indexColumnName>startID" is
                        appended, so the query must not contain its own WHERE clause
    3. indexDirectory   directory of the Lucene index
    4. titleColumnName  column indexed as the document title
    5. indexColumnName  column holding the record id
    6. startID          id checkpoint for incremental runs; rewritten by the program after a run
  Keep comments outside <body>: Program.cs addresses child nodes by position and element type.
  NOTE(review): the connection strings below contain a plain-text password; replace before reuse.
-->
<body>
<project id="1" tableName="招商">
<sqlconn>server=.;uid=sa;pwd=liugehao;database=yaowanggu;</sqlconn>
<sql>select Z_ID, Z_Name, Z_LX, Z_ZLH, Z_PZWH, Z_SCCS, Z_JX, Z_GG, Z_CF, Z_YFYL, Z_CPXN, Z_CPSM, Z_ZSQY, Z_DLTJ, Z_TGZC, Z_BZ, Z_DWMC, Z_LXDH, Z_LXR, Z_SJ, Z_CZ, Z_DZYJ, Z_GSWZ, Z_DZ, Z_YB from ywg_zhaoshang</sql>
<indexDirectory>F:\Lucene\SearchDemo\wwwroot\index\zs</indexDirectory>
<titleColumnName>Z_Name</titleColumnName>
<indexColumnName>z_id</indexColumnName>
<startID>2979</startID>
</project>
<project id="2" tableName="代理">
<sqlconn>server=.;uid=sa;pwd=liugehao;database=yaowanggu;</sqlconn>
<sql>select top 1000 DL_ID, DL_MC, DL_LB, DL_DQ, DL_XZ, DL_CDLCP, DL_SM from ywg_daili</sql>
<indexDirectory>F:\Lucene\SearchDemo\wwwroot\index\dl</indexDirectory>
<titleColumnName>DL_MC</titleColumnName>
<indexColumnName>DL_ID</indexColumnName>
<startID>1000</startID>
</project>
</body>
编辑推荐DotLucene搜索引擎文章列表:
全文搜索解决方案:DotLucene搜索引擎之创建索引
http://www.xueit.com/html/2009-02/21_606_00.html
DotLucene搜索引擎之搜索索引Demo
http://www.xueit.com/html/2009-02/21_607_00.html
全文搜索技术:dotLucene中文分词的highlight显示
http://www.xueit.com/html/2009-02/21_608_00.html
Lucene.NET增加中文分词
http://www.xueit.com/html/2009-02/21_609_00.html
全文搜索之Lucene增加中文分词功能方法
http://www.xueit.com/html/2009-02/21_610_00.html
简介下基于.NET的全文索引引擎Lucene.NET
http://www.xueit.com/html/2009-02/21_611_00.html
使用dotlucene为数据库建立全文索引
http://www.xueit.com/html/2009-02/21_612_00.html
使用dotlucene多条件检索数据库
http://www.xueit.com/html/2009-02/21_613_00.html
Lucene中文分词实现方法:基于StopWord分割分词
http://www.xueit.com/html/2009-02/21_614_00.html
dotLucene实现增量索引源代码
http://www.xueit.com/html/2009-02/21_615_00.html