前一段时间写了一个小工具 aurora,用来统计文本文件里各个单词出现的次数。
源代码如下。
//// console version
#include "cai.h"
#include <iostream>
using namespace std;
// cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq
main(int argc, char* argv[])
{
ios::sync_with_stdio(false);
error err = OK;
cmd_argument cmd;
cmd = parse_cmd_argument(argc, argv, err);
cout << "In memeory of Aurora(1984)" << endl;
if (err!=OK)
{
cout << "cmd arguments parsing error." << endl;
cout << "cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq|alph" << endl;
}
input_text(cmd.text_file_name, err);
get_skip_table(cmd.skip_table_file_name, err);
get_dup_table(cmd.dup_table_file_name, err);
get_delimiter_table(cmd.delimiter_table_file_name, err);
segement();
remove_skip();
remove_dup();
output_vocabulary(cmd.sort);
}
/////
/// cai.h for console version.
/////
#include <vector>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <cctype>
using namespace std;
// Values taken from the command line by parse_cmd_argument().
// Usage: cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq
struct cmd_argument
{
// File name following -text (argv[2]); main() passes it to input_text().
string text_file_name;
// File name following -delimiter (argv[4]); main() passes it to get_delimiter_table().
string delimiter_table_file_name;
// File name following -skip (argv[6]); main() passes it to get_skip_table().
string skip_table_file_name;
// File name following -dup (argv[8]); main() passes it to get_dup_table().
string dup_table_file_name;
// Value following -sort (argv[10]); output_vocabulary() sorts by frequency
// when it equals "freq" and alphabetically for any other value.
string sort;
};
// Shared state of the processing pipeline. Each stage reads what the previous
// stage produced and appends to its own container.

// Words to leave out of the count; filled by get_skip_table(), read by remove_skip().
vector<string> skip_table;
// Characters that segement() turns into spaces; filled by get_delimiter_table().
vector<char> delimiter_table;
// Key: variant form (e.g. "meaning"); value: root form (e.g. "mean") that replaces it.
map<string, string> dup_table;
// Lower-cased content of the input file, lines joined by spaces; filled by input_text().
string whole_text;
// Non-empty tokens of whole_text; filled by segement().
vector<string> text_segmented;
// text_segmented without the words listed in skip_table; filled by remove_skip().
vector<string> text_skip_removed;
// text_skip_removed after variants were replaced by their roots; set by remove_dup().
vector<string> text_dup_removed;
// Occurrence count per word; filled by output_vocabulary().
map<string, int> voc_output;
// Status code passed by reference through the loading functions.
enum error
{
// No error; callers initialise their status variable with this value.
OK,
// Set by parse_cmd_argument() when the argument count is not 11.
cmd_pasrsing_error
};
// cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq
/// Builds a cmd_argument from the raw command line.
///
/// Only the number of arguments is validated. The values are picked by
/// position (argv[2], argv[4], argv[6], argv[8], argv[10]); the flag names
/// sitting between them are not inspected.
///
/// @param argc number of entries in argv; exactly 11 is accepted.
/// @param argv raw command-line arguments.
/// @param err  set to cmd_pasrsing_error when argc != 11, left untouched otherwise.
/// @return the filled structure, or one with empty strings on error.
cmd_argument parse_cmd_argument(int argc, char* argv[], error& err)
{
    cmd_argument parsed;

    if (argc == 11)
    {
        parsed.text_file_name            = argv[2];
        parsed.delimiter_table_file_name = argv[4];
        parsed.skip_table_file_name      = argv[6];
        parsed.dup_table_file_name       = argv[8];
        parsed.sort                      = argv[10];
        return parsed;
    }

    err = cmd_pasrsing_error;
    return parsed;
}
/// Lower-cases a single character.
/// The value is routed through unsigned char because handing a negative
/// char (any byte >= 0x80 where char is signed) to tolower() is undefined.
static char lower_char(char c)
{
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
}

/// Appends the content of a text file to whole_text and lower-cases whole_text.
///
/// Every line read is followed by a single space so that words on adjacent
/// lines do not run together. A file that cannot be opened contributes nothing.
///
/// @param file_name path of the text file to read.
/// @param err       not modified by this function.
void input_text(string file_name, error& err)
{
    ifstream in(file_name.c_str());
    string line;
    while (getline(in, line))
    {
        whole_text += line;
        whole_text += ' ';
    }
    transform(whole_text.begin(), whole_text.end(), whole_text.begin(), lower_char);
}
/// Loads the skip list: each line of the file becomes one entry of skip_table.
///
/// Lines are stored exactly as read. A file that cannot be opened adds nothing.
///
/// @param file_name path of the skip-table file.
/// @param err       not modified by this function.
void get_skip_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    for (string entry; getline(source, entry); )
    {
        skip_table.push_back(entry);
    }
}
/// Loads the variant-to-root table into dup_table.
///
/// Each line holds whitespace-separated words: the first one is the root,
/// every following word is a variant mapped to that root. Lines without any
/// word are ignored; a line with only a root adds no entry.
///
/// @param file_name path of the dup-table file.
/// @param err       not modified by this function.
void get_dup_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    string row;
    while (getline(source, row))
    {
        istringstream fields(row);

        string head;
        if (!(fields >> head))
        {
            continue;   // blank line: nothing to record
        }

        string alias;
        while (fields >> alias)
        {
            dup_table[alias] = head;
        }
    }
}
/// Loads the delimiter characters into delimiter_table.
///
/// Characters are read with formatted extraction, so whitespace in the file
/// is skipped and every other character becomes one delimiter.
///
/// @param file_name path of the delimiter-table file.
/// @param err       not modified by this function.
void get_delimiter_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    for (char symbol; source >> symbol; )
    {
        delimiter_table.push_back(symbol);
    }
}
// whole_text is filtered with the delimiter table into text_segmented
void segement()
{
// cout << "skip:" ;
/* for (vector<string>::iterator i = skip_table.begin(); i!=skip_table.end() ; ++i) */
/* { */
/* cout << *i << "\t"; */
/* } */
// cout << endl << endl;
// cout << "dup:" << endl;
/* for (map<string, string>::iterator i = dup_table.begin(); i!=dup_table.end() ; ++i) */
/* { */
/* cout << i->first << "\t" << i->second << endl; */
/* } */
// cout << endl << endl;
/* cout << "delimiter:" << endl; */
/* for (vector<char>::iterator i = delimiter_table.begin(); i!=delimiter_table.end() ; ++i) */
/* { */
/* cout << *i << "\t"; */
/* } */
/* cout << endl << endl; */
///
for (vector<char>::iterator i=delimiter_table.begin(); i != delimiter_table.end(); ++i)
{
replace(whole_text.begin(), whole_text.end(), *i, ' ');
}
stringstream ss;
ss << whole_text;
string word;
while(getline(ss, word, ' '))
{
if (word!="")
{
text_segmented.push_back(word);
}
}
/* cout << "----------" << endl; */
/* for (vector<string>::iterator i=text_segmented.begin(); i != text_segmented.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
void remove_skip()
{
bool skip = false;
for (vector<string>::iterator i = text_segmented.begin(); i != text_segmented.end(); ++i)
{
for (vector<string>::iterator j = skip_table.begin(); j != skip_table.end(); ++j)
{
if (*i == *j)
{
skip = true;
break;
}
}
if (skip==false)
{
text_skip_removed.push_back(*i);
}
else
{
skip=false;
}
}
/* for (vector<string>::iterator i=text_skip_removed.begin(); i != text_skip_removed.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
void remove_dup()
{
for (map<string, string>::iterator j = dup_table.begin(); j != dup_table.end(); ++j)
{
replace(text_skip_removed.begin(), text_skip_removed.end(), j->first, j->second);
}
text_dup_removed = text_skip_removed;
/* for (vector<string>::iterator i=text_dup_removed.begin(); i != text_dup_removed.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
/// Ordering used to list the vocabulary by descending frequency.
///
/// Words with the same count are ordered alphabetically. Without that
/// tie-break equal counts compare as equivalent, and std::sort may then
/// place them in any order.
///
/// @param a first (word, count) entry.
/// @param b second (word, count) entry.
/// @return true when a must be listed before b.
bool more_freq (const pair<string,int>& a, const pair<string,int>& b)
{
    if (a.second != b.second)
    {
        return a.second > b.second;
    }
    return a.first < b.first;
}
/// Counts the words of text_dup_removed into voc_output and prints the result.
///
/// @param sorting "freq" prints the words ordered with more_freq(); any other
///                value prints them in the key order of voc_output.
void output_vocabulary(string sorting)
{
    typedef map<string, int>::iterator   count_iter;
    typedef vector<pair<string, int> >   ranking;

    for (vector<string>::size_type n = 0; n < text_dup_removed.size(); ++n)
    {
        ++voc_output[text_dup_removed[n]];
    }

    if (sorting != "freq")
    {
        cout << "-------------------" << endl;
        cout << "vocabulary\tcount sorted by alph" << endl;
        for (count_iter entry = voc_output.begin(); entry != voc_output.end(); ++entry)
        {
            cout << entry->first << "\t\t" << entry->second << endl;
        }
        return;
    }

    // Copy the map into a vector because a map cannot be re-sorted by value.
    ranking by_count;
    for (count_iter entry = voc_output.begin(); entry != voc_output.end(); ++entry)
    {
        by_count.push_back(*entry);
    }
    sort(by_count.begin(), by_count.end(), more_freq);

    cout << "-------------------" << endl;
    cout << "vocabulary\tcount sorted by freq" << endl;
    for (ranking::iterator row = by_count.begin(); row != by_count.end(); ++row)
    {
        cout << row->first << "\t\t" << row->second << endl;
    }
}
//// GUI version
///auroraDlg.cpp
//////////////
void CAuroraDlg::OnOK()
{
const char pszFilter[] = _T("TXT File (*.txt)|*.txt|All Files (*.*)|*.*||");
CFileDialog dlg(TRUE, NULL, NULL, OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT, pszFilter, this);
CString strFilePath;
if(dlg.DoModal() == IDOK)
{
strFilePath = dlg.GetPathName();
}
char file_name[1024];
strcpy(file_name, (LPCTSTR)strFilePath );
error err = OK;
m_text_out = "reading...";
UpdateData(FALSE);
input_text(file_name, err);
m_text_out = "reading...done";
UpdateData(FALSE);
get_skip_table("skip.txt", err);
get_dup_table("dup.txt", err);
get_delimiter_table("space.txt", err);
m_text_out = "segmenting the text...";
UpdateData(FALSE);
segement();
m_text_out = "segmenting the text...done";
UpdateData(FALSE);
m_text_out = "removing spaces...";
UpdateData(FALSE);
remove_skip();
m_text_out = "removing spaces...done";
UpdateData(FALSE);
m_text_out = "merging the words duplicated...";
UpdateData(FALSE);
remove_dup();
m_text_out = "merging the words duplicated...done";
UpdateData(FALSE);
m_text_out = "sorting...";
UpdateData(FALSE);
output_vocabulary("freq");
m_text_out = "sorting...done";
UpdateData(FALSE);
m_text_out = text_out.c_str();
UpdateData(FALSE);
}
//// cai.h for the GUI version
#include <vector>
#include <string>
#include <map>
#include <iostream>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <cctype>
using namespace std;
// Values taken from the command line by parse_cmd_argument().
// Usage: cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq
struct cmd_argument
{
// File name following -text (argv[2]).
string text_file_name;
// File name following -delimiter (argv[4]).
string delimiter_table_file_name;
// File name following -skip (argv[6]).
string skip_table_file_name;
// File name following -dup (argv[8]).
string dup_table_file_name;
// Value following -sort (argv[10]); output_vocabulary() compares its argument with "freq".
string sort;
};
// Shared state of the processing pipeline. Each stage reads what the previous
// stage produced and appends to its own container.

// Words to leave out of the count; filled by get_skip_table(), read by remove_skip().
vector<string> skip_table;
// Characters that segement() turns into spaces; filled by get_delimiter_table().
vector<char> delimiter_table;
// Key: variant form (e.g. "meaning"); value: root form (e.g. "mean") that replaces it.
map<string, string> dup_table;
// Lower-cased content of the input file, lines joined by spaces; filled by input_text().
string whole_text;
// Non-empty tokens of whole_text; filled by segement().
vector<string> text_segmented;
// text_segmented without the words listed in skip_table; filled by remove_skip().
vector<string> text_skip_removed;
// text_skip_removed after variants were replaced by their roots; set by remove_dup().
vector<string> text_dup_removed;
// Occurrence count per word; filled by output_vocabulary().
map<string, int> voc_output;
// Report text built by output_vocabulary(); the dialog copies it into m_text_out.
string text_out;
// Status code passed by reference through the loading functions.
enum error
{
// No error; callers initialise their status variable with this value.
OK,
// Set by parse_cmd_argument() when the argument count is not 11.
cmd_pasrsing_error
};
// cai.exe -text file_name -delimiter file_name -skip file_name -dup file_name -sort freq
/// Builds a cmd_argument from the raw command line.
///
/// Only the number of arguments is validated. The values are picked by
/// position (argv[2], argv[4], argv[6], argv[8], argv[10]); the flag names
/// sitting between them are not inspected.
///
/// @param argc number of entries in argv; exactly 11 is accepted.
/// @param argv raw command-line arguments.
/// @param err  set to cmd_pasrsing_error when argc != 11, left untouched otherwise.
/// @return the filled structure, or one with empty strings on error.
cmd_argument parse_cmd_argument(int argc, char* argv[], error& err)
{
    cmd_argument parsed;

    if (argc == 11)
    {
        parsed.text_file_name            = argv[2];
        parsed.delimiter_table_file_name = argv[4];
        parsed.skip_table_file_name      = argv[6];
        parsed.dup_table_file_name       = argv[8];
        parsed.sort                      = argv[10];
        return parsed;
    }

    err = cmd_pasrsing_error;
    return parsed;
}
/// Lower-cases a single character.
/// The value is routed through unsigned char because handing a negative
/// char (any byte >= 0x80 where char is signed) to tolower() is undefined.
static char lower_char(char c)
{
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
}

/// Appends the content of a text file to whole_text and lower-cases whole_text.
///
/// Every line read is followed by a single space so that words on adjacent
/// lines do not run together. A file that cannot be opened contributes nothing.
///
/// @param file_name path of the text file to read.
/// @param err       not modified by this function.
void input_text(string file_name, error& err)
{
    ifstream in(file_name.c_str());
    string line;
    while (getline(in, line))
    {
        whole_text += line;
        whole_text += ' ';
    }
    transform(whole_text.begin(), whole_text.end(), whole_text.begin(), lower_char);
}
/// Loads the skip list: each line of the file becomes one entry of skip_table.
///
/// Lines are stored exactly as read. A file that cannot be opened adds nothing.
///
/// @param file_name path of the skip-table file.
/// @param err       not modified by this function.
void get_skip_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    for (string entry; getline(source, entry); )
    {
        skip_table.push_back(entry);
    }
}
/// Loads the variant-to-root table into dup_table.
///
/// Each line holds whitespace-separated words: the first one is the root,
/// every following word is a variant mapped to that root. Lines without any
/// word are ignored; a line with only a root adds no entry.
///
/// @param file_name path of the dup-table file.
/// @param err       not modified by this function.
void get_dup_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    string row;
    while (getline(source, row))
    {
        istringstream fields(row);

        string head;
        if (!(fields >> head))
        {
            continue;   // blank line: nothing to record
        }

        string alias;
        while (fields >> alias)
        {
            dup_table[alias] = head;
        }
    }
}
/// Loads the delimiter characters into delimiter_table.
///
/// Characters are read with formatted extraction, so whitespace in the file
/// is skipped and every other character becomes one delimiter.
///
/// @param file_name path of the delimiter-table file.
/// @param err       not modified by this function.
void get_delimiter_table(string file_name, error& err)
{
    ifstream source(file_name.c_str());
    for (char symbol; source >> symbol; )
    {
        delimiter_table.push_back(symbol);
    }
}
// whole_text is filtered with the delimiter table into text_segmented
void segement()
{
// cout << "skip:" ;
/* for (vector<string>::iterator i = skip_table.begin(); i!=skip_table.end() ; ++i) */
/* { */
/* cout << *i << "\t"; */
/* } */
// cout << endl << endl;
// cout << "dup:" << endl;
/* for (map<string, string>::iterator i = dup_table.begin(); i!=dup_table.end() ; ++i) */
/* { */
/* cout << i->first << "\t" << i->second << endl; */
/* } */
// cout << endl << endl;
/* cout << "delimiter:" << endl; */
/* for (vector<char>::iterator i = delimiter_table.begin(); i!=delimiter_table.end() ; ++i) */
/* { */
/* cout << *i << "\t"; */
/* } */
/* cout << endl << endl; */
///
for (vector<char>::iterator i=delimiter_table.begin(); i != delimiter_table.end(); ++i)
{
replace(whole_text.begin(), whole_text.end(), *i, ' ');
}
stringstream ss;
ss << whole_text;
string word;
while(getline(ss, word, ' '))
{
if (word!="")
{
text_segmented.push_back(word);
}
}
/* cout << "----------" << endl; */
/* for (vector<string>::iterator i=text_segmented.begin(); i != text_segmented.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
void remove_skip()
{
bool skip = false;
for (vector<string>::iterator i = text_segmented.begin(); i != text_segmented.end(); ++i)
{
for (vector<string>::iterator j = skip_table.begin(); j != skip_table.end(); ++j)
{
if (*i == *j)
{
skip = true;
break;
}
}
if (skip==false)
{
text_skip_removed.push_back(*i);
}
else
{
skip=false;
}
}
/* for (vector<string>::iterator i=text_skip_removed.begin(); i != text_skip_removed.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
void remove_dup()
{
for (map<string, string>::iterator j = dup_table.begin(); j != dup_table.end(); ++j)
{
replace(text_skip_removed.begin(), text_skip_removed.end(), j->first, j->second);
}
text_dup_removed = text_skip_removed;
/* for (vector<string>::iterator i=text_dup_removed.begin(); i != text_dup_removed.end(); ++i) */
/* { */
/* cout << *i << endl; */
/* } */
}
/// Ordering used to list the vocabulary by descending frequency.
///
/// Words with the same count are ordered alphabetically. Without that
/// tie-break equal counts compare as equivalent, and std::sort may then
/// place them in any order.
///
/// @param a first (word, count) entry.
/// @param b second (word, count) entry.
/// @return true when a must be listed before b.
bool more_freq (const pair<string,int>& a, const pair<string,int>& b)
{
    if (a.second != b.second)
    {
        return a.second > b.second;
    }
    return a.first < b.first;
}
void output_vocabulary(string sorting)
{
for (vector<string>::iterator i=text_dup_removed.begin(); i != text_dup_removed.end(); ++i)
{
voc_output[*i]++;
}
if (sorting=="freq")
{
vector<pair<string, int> > voc; //(voc_output.begin(), voc_output.end());
for (map<string,int>::iterator i=voc_output.begin();
i != voc_output.end();
++i)
{
voc.push_back(*i);
}
sort(voc.begin(), voc.end(), more_freq);
// cout << "-------------------" << endl;
// cout << "vocabulary\tcount sorted by freq" << endl;
text_out = "vocabulary\tcount sorted by freq\r\n";
vector<pair<string, int> >::iterator j;
for (j = voc.begin(); j != voc.end(); ++j)
{
//cout << j->first << "\t\t" << j->second << endl;
stringstream ss;
string num;
ss << j->second;
ss >> num;
text_out += j->first;
text_out += "\t\t";
text_out += num;
text_out += "\r\n";
}
}
else
{
// cout << "-------------------" << endl;
// cout << "vocabulary\tcount sorted by alph" << endl;
text_out = "vocabulary\tcount sorted by alph\r";
map<string,int>::iterator k;
for (k=voc_output.begin(); k != voc_output.end(); ++k)
{
//cout << k->first << "\t\t" << k->second << endl;
stringstream ss;
string num;
ss << k->second;
ss >> num;
text_out += k->first;
text_out += "\t\t";
text_out += num;
text_out += "\r\n";
}
}
}
/////////////////////////
///////// diff console GUI
$ diff cai.h GUI/aurora/cai.h
29a30
> string text_out;
249,253c250,264
< cout << "-------------------" << endl;
< cout << "vocabulary\tcount sorted by freq" << endl;
< for (vector<pair<string, int> >::iterator i = voc.begin(); i != voc.end(); ++i)
< {
< cout << i->first << "\t\t" << i->second << endl;
---
> // cout << "-------------------" << endl;
> // cout << "vocabulary\tcount sorted by freq" << endl;
> text_out = "vocabulary\tcount sorted by freq\r\n";
> vector<pair<string, int> >::iterator j;
> for (j = voc.begin(); j != voc.end(); ++j)
> {
> //cout << j->first << "\t\t" << j->second << endl;
> stringstream ss;
> string num;
> ss << j->second;
> ss >> num;
> text_out += j->first;
> text_out += "\t\t";
> text_out += num;
> text_out += "\r\n";
258,262c269,283
< cout << "-------------------" << endl;
< cout << "vocabulary\tcount sorted by alph" << endl;
< for (map<string,int>::iterator i=voc_output.begin(); i != voc_output.end(); ++i)
< {
< cout << i->first << "\t\t" << i->second << endl;
---
> // cout << "-------------------" << endl;
> // cout << "vocabulary\tcount sorted by alph" << endl;
> text_out = "vocabulary\tcount sorted by alph\r";
> map<string,int>::iterator k;
> for (k=voc_output.begin(); k != voc_output.end(); ++k)
> {
> //cout << k->first << "\t\t" << k->second << endl;
> stringstream ss;
> string num;
> ss << k->second;
> ss >> num;
> text_out += k->first;
> text_out += "\t\t";
> text_out += num;
> text_out += "\r\n";