c++ - 如何使用 Boost.Spirit.Qi 增量解析(并作用于)大文件？

coder 2024-02-23 原文

我为自定义文本文件格式创建了一个 Qi 解析器。有数以万计的条目要处理，每个条目通常有 1-10 个子条目。我放了一个精简的解析器工作示例 here .

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/spirit/include/phoenix_stl.hpp>
#include <boost/spirit/include/phoenix_object.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/io.hpp>

#include <fstream>
#include <iostream>
#include <string>

using std::string;
using std::vector;
using std::cout;
using std::endl;

namespace model
{
    namespace qi = boost::spirit::qi;

    struct spectrum
    {
        string comment;
        string file;
        string nativeId;
        double precursorMz;
        int precursorCharge;
        double precursorIntensity;
    };

    struct cluster
    {
        string id;
        vector<spectrum> spectra;
    };

    struct clustering
    {
        string name;
        vector<cluster> clusters;
    };
}

// Tell fusion about the data structures to make them first-class fusion citizens.
// Must be at global scope.

BOOST_FUSION_ADAPT_STRUCT(
    model::spectrum,
    (string, comment)
    (string, file)
    (string, nativeId)
    (double, precursorMz)
    (int, precursorCharge)
    (double, precursorIntensity)
)

BOOST_FUSION_ADAPT_STRUCT(
    model::cluster,
    (string, id)
    (std::vector<model::spectrum>, spectra)
)

BOOST_FUSION_ADAPT_STRUCT(
    model::clustering,
    (string, name)
    (std::vector<model::cluster>, clusters)
)

namespace {
    struct ReportError
    {
      template<typename, typename, typename, typename> struct result { typedef void type; };

      // contract the string to the surrounding new-line characters
      template<typename Iter>
      void operator()(Iter first_iter, Iter last_iter,
                      Iter error_iter, const boost::spirit::qi::info& what) const
      {
        std::string first(first_iter, error_iter);
        std::string last(error_iter, last_iter);
        auto first_pos = first.rfind('\n');
        auto last_pos = last.find('\n');
        auto error_line = ((first_pos == std::string::npos) ? first
                            : std::string(first, first_pos + 1))
                          + std::string(last, 0, last_pos);
        //auto error_pos = (error_iter - first_iter) + 1;
        /*auto error_pos = error
        if (first_pos != std::string::npos)
          error_pos -= (first_pos + 1);*/

        std::cerr << "Error parsing in " << what << std::endl
                  << error_line << std::endl
                  //<< std::setw(error_pos) << '^'
                  << std::endl;
        }
    };

    const boost::phoenix::function<ReportError> report_error = ReportError();
}

namespace model
{
    template <typename Iterator>
    struct cluster_parser : qi::grammar<Iterator, clustering(), qi::blank_type>
    {
        cluster_parser() : cluster_parser::base_type(clusters)
        {
            using qi::int_;
            using qi::lit;
            using qi::double_;
            using qi::bool_;
            using qi::lexeme;
            using qi::eol;
            using qi::ascii::char_;
            using qi::on_error;
            using qi::fail;
            using namespace qi::labels;
            using boost::phoenix::construct;
            using boost::phoenix::val;

            quoted_string %= lexeme['"' > +(char_ - '"') > '"'];

            spectrum_start %=
                lit("SPEC") >
                "#" > +(char_ - "File:") >
                "File:" > quoted_string > lit(",") >
                "NativeID:" > quoted_string >
                bool_ > double_ > int_ > double_;

            cluster_start %= 
                "=Cluster=" > eol >
                "id=" > +(char_ - eol) > eol >
                spectrum_start % eol;

            clusters %= 
                "name=" > +(char_ - eol) > eol >
                eol >
                cluster_start % eol;

            BOOST_SPIRIT_DEBUG_NODES((clusters)(cluster_start)(quoted_string)(spectrum_start))

            //on_error<fail>(clusters, report_error(_1, _2, _3, _4));
            //on_error<fail>(cluster_start, report_error(_1, _2, _3, _4));
            //on_error<fail>(spectrum_start, report_error(_1, _2, _3, _4));
            //on_error<fail>(quoted_string, report_error(_1, _2, _3, _4));

            // on_success(cluster_start, quantify_cluster(_1, _2, _3, _4)); ??
        }

        qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
        qi::rule<Iterator, cluster(), qi::blank_type> cluster_start;
        qi::rule<Iterator, spectrum(), qi::blank_type> spectrum_start;
        qi::rule<Iterator, clustering(), qi::blank_type> clusters;
    };
}

int main()
{
    using namespace model;

    cluster_parser<boost::spirit::istream_iterator> g; // Our grammar
    string str;
    //std::ifstream input("c:/test/Mo_tai.clustering");
    std::istringstream input("name=GreedyClustering_0.99\n"
"\n"
"=Cluster=\n"
"id=9c8c5830-5841-4f77-b819-64180509615b\n"
"SPEC\t#file=w:\\test\\Mo_Tai_iTRAQ_f4.mgf#id=index=219#title=Mo_Tai_iTRAQ_f4.1254.1254.2 File:\"Mo_Tai_iTRAQ_f4.raw\", NativeID:\"controllerType=0 controllerNumber=1 scan=1254\"\ttrue\t\t300.1374\t2\t\t\t0.0\n"
"=Cluster=\n"
"id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342\n"
"SPEC\t#file=w:\\test\\Mo_Tai_iTRAQ_f9.mgf#id=index=560#title=Mo_Tai_iTRAQ_f9.1666.1666.3 File:\"Mo_Tai_iTRAQ_f9.raw\", NativeID:\"controllerType=0 controllerNumber=1 scan=1666\"\ttrue\t\t300.14413\t3\t\t\t0.0\n"
"SPEC\t#file=w:\\test\\Mo_Tai_iTRAQ_f9.mgf#id=index=520#title=Mo_Tai_iTRAQ_f9.1621.1621.3 File:\"Mo_Tai_iTRAQ_f9.raw\", NativeID:\"controllerType=0 controllerNumber=1 scan=1621\"\ttrue\t\t300.14197\t3\t\t\t0.0\n"
"=Cluster=\n"
"id=b84b79e1-44bc-44c0-a9af-5391ca02582d\n"
"SPEC\t#file=w:\\test\\Mo_Tai_iTRAQ_f2.mgf#id=index=7171#title=Mo_Tai_iTRAQ_f2.12729.12729.2 File:\"Mo_Tai_iTRAQ_f2.raw\", NativeID:\"controllerType=0 controllerNumber=1 scan=12729\"\ttrue\t\t300.15695\t2\t\t\t0.0");
    input.unsetf(std::ios::skipws);
    boost::spirit::istream_iterator begin(input);
    boost::spirit::istream_iterator end;

    clustering clusteringResults;
    bool r = phrase_parse(begin, end, g, qi::blank, clusteringResults);

    if (r && begin == end)
    {
        cout << "Parsing succeeded (" << clusteringResults.clusters.size() << " clusters)\n";

        /*for (size_t i = 0; i < std::min((size_t)10, clusteringResults.clusters.size()); ++i)
        {
            cluster& c = clusteringResults.clusters[i];
            cout << "Cluster " << c.id << " - avg. precursor m/z: " << c.avgPrecursorMz << ", num. spectra: " << c.spectra.size() << endl;
        }*/
        return 1;
    }
    else
    {
        std::cout << "Parsing failed (" << clusteringResults.clusters.size() << " clusters)\n";

        if (!clusteringResults.clusters.empty())
        {
            cluster& c = clusteringResults.clusters.back();
            cout << "Last cluster parsed " << c.id << ", num. spectra: " << c.spectra.size() << endl;
        }
        return 1;
    }
}

我不想在处理之前将整个文件解析到内存中。如何让它在每个簇解析完后排队一个条目(簇)进行处理，处理完删除簇，然后继续解析？更好的方法是让另一个线程异步处理处理。

最佳答案

只需使用流式迭代器。

或者对内存映射文件进行操作。

在处理端，将 Action 从语义 Action 内部推送到队列。

Note: you could run into a supposed bug that doesn't clear the backtrack buffers properly; You might want to check this and take preventative measures as described in this answer: Boost spirit memory leak using flush_multi_pass

Live Demo

#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/fusion/include/io.hpp>

namespace model
{
    namespace qi = boost::spirit::qi;
    namespace px = boost::phoenix;

    struct spectrum {
        std::string comment;
        std::string file;
        std::string nativeId;
        double      precursorMz;
        int         precursorCharge;
        double      precursorIntensity;
    };

    struct cluster {
        std::string           id;
        std::vector<spectrum> spectra;
    };
}

BOOST_FUSION_ADAPT_STRUCT(model::spectrum, comment, file, nativeId, precursorMz, precursorCharge, precursorIntensity)
BOOST_FUSION_ADAPT_STRUCT(model::cluster, id, spectra)

namespace model
{
    template <typename Iterator>
    struct cluster_parser : qi::grammar<Iterator>
    {
        cluster_parser(std::function<void(std::string const&, model::cluster const&)> handler) 
            :   cluster_parser::base_type(start),
                submit_(handler)
        {
            using namespace qi;

            quoted_string %= lexeme['"' > +(char_ - '"') > '"'];

            spectrum_start %=
                lit("SPEC") >
                "#" > +(char_ - "File:") >
                "File:" > quoted_string > lit(",") >
                "NativeID:" > quoted_string >
                bool_ > double_ > int_ > double_;

            cluster_start %= 
                "=Cluster=" > eol >
                "id=" > +(char_ - eol) > eol >
                spectrum_start % eol;


            clusters %= 
                "name=" > qi::as_string[ +(char_ - eol) ][ name_ = _1 ] > eol > eol >
                cluster_start [ submit_(name_, _1) ] % eol;

            start = skip(blank) [clusters];

            BOOST_SPIRIT_DEBUG_NODES((start)(clusters)(cluster_start)(quoted_string)(spectrum_start))
        }
      private:
        qi::_a_type name_;
        px::function<std::function<void(std::string const&, model::cluster const&)> > submit_;

        qi::rule<Iterator, std::string(), qi::blank_type> quoted_string;
        qi::rule<Iterator, cluster(), qi::blank_type> cluster_start;
        qi::rule<Iterator, spectrum(), qi::blank_type> spectrum_start;
        qi::rule<Iterator, qi::locals<std::string>, qi::blank_type> clusters;
        qi::rule<Iterator> start;
    };
}

int main()
{
    using namespace model;

    cluster_parser<boost::spirit::istream_iterator> g([&](auto const&...){std::cout << "handled\n";}); // Our grammar
    std::string str;
    //std::ifstream input("c:/test/Mo_tai.clustering");

    std::istringstream input(R"(name=GreedyClustering_0.99

=Cluster=
id=9c8c5830-5841-4f77-b819-64180509615b
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f4.mgf#id=index=219#title=Mo_Tai_iTRAQ_f4.1254.1254.2 File:"Mo_Tai_iTRAQ_f4.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1254"   true        300.1374    2           0.0
=Cluster=
id=f8f384a1-3d5f-4af1-9581-4d03a5aa3342
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=560#title=Mo_Tai_iTRAQ_f9.1666.1666.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1666"   true        300.14413   3           0.0
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f9.mgf#id=index=520#title=Mo_Tai_iTRAQ_f9.1621.1621.3 File:"Mo_Tai_iTRAQ_f9.raw", NativeID:"controllerType=0 controllerNumber=1 scan=1621"   true        300.14197   3           0.0
=Cluster=
id=b84b79e1-44bc-44c0-a9af-5391ca02582d
SPEC    #file=w:\test\Mo_Tai_iTRAQ_f2.mgf#id=index=7171#title=Mo_Tai_iTRAQ_f2.12729.12729.2 File:"Mo_Tai_iTRAQ_f2.raw", NativeID:"controllerType=0 controllerNumber=1 scan=12729"   true        300.15695   2           0.0)");
    input.unsetf(std::ios::skipws);
    boost::spirit::istream_iterator begin(input);
    boost::spirit::istream_iterator end;

    bool r = phrase_parse(begin, end, g, qi::blank);

    if (r && begin == end) {
        std::cout << "Parsing succeeded\n";
    }
    else {
        std::cout << "Parsing failed\n";
    }

    if (begin!=end) {
        std::cout << "Unparsed remaining input: '" << std::string(begin, end) << "\n";
    }

    return (r && begin==end)? 0 : 1;
}

打印

handled
handled
handled
Parsing succeeded

奖励:线程 worker

这是一个在线程池上调度集群以进行异步处理的版本。

Note that the submit method posts a lambda to the service. The lambda captures by value because the lifetime of the parameters should extend during the processing.

Live On Coliru

#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;

struct Processing {
    Processing() {
        for (unsigned i=0; i < boost::thread::hardware_concurrency(); ++i)
            _threads.create_thread([this] { _svc.run(); });
    }

    ~Processing() {
        _work.reset();
        _threads.join_all();
    }

    void submit(std::string const& name, model::cluster const& cluster) {
        _svc.post([=] { do_processing(name, cluster); });
    }

  private:
    void do_processing(std::string const& name, model::cluster const& cluster) {
        std::cout << "Thread " << boost::this_thread::get_id() << ": " << name << " cluster of " << cluster.spectra.size() << " spectra\n";
        boost::this_thread::sleep_for(boost::chrono::milliseconds(950));
    }

    ba::io_service _svc;
    boost::optional<ba::io_service::work> _work = ba::io_service::work(_svc);
    boost::thread_group _threads;
};

[...snip...] 和主要内容:

Processing processing;
auto handler = [&processing](auto&... args) { processing.submit(args...); };

cluster_parser<boost::spirit::istream_iterator> g(handler); // Our grammar

其余未修改，现在打印(例如):

Thread 7f0144a5b700: GreedyClustering_0.99 cluster of 1 spectra
Thread 7f014425a700: GreedyClustering_0.99 cluster of 2 spectra
Parsing succeeded
Thread 7f0143a59700: GreedyClustering_0.99 cluster of 1 spectra

关于c++ - 如何使用 Boost.Spirit.Qi 增量解析(并作用于)大文件？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/41748596/

大文并作 34 lt cluster c++parsing boost boost-spirit boost-spirit-qi

有关c++ - 如何使用 Boost.Spirit.Qi 增量解析(并作用于)大文件？的更多相关文章

ruby - 如何使用 Nokogiri 的 xpath 和 at_xpath 方法 - 2
我正在学习如何使用Nokogiri，根据这段代码我遇到了一些问题:require'rubygems'require'mechanize'post_agent=WWW::Mechanize.newpost_page=post_agent.get('http://www.vbulletin.org/forum/showthread.php?t=230708')puts"\nabsolutepathwithtbodygivesnil"putspost_page.parser.xpath('/html/body/div/div/div/div/div/table/tbody/tr/td/div
ruby - 如何从 ruby 中的字符串运行任意对象方法？ - 2
总的来说，我对ruby还比较陌生，我正在为我正在创建的对象编写一些rspec测试用例。许多测试用例都非常基础，我只是想确保正确填充和返回值。我想知道是否有办法使用循环结构来执行此操作。不必为我要测试的每个方法都设置一个assertEquals。例如:describeitem,"TestingtheItem"doit"willhaveanullvaluetostart"doitem=Item.new#HereIcoulddotheitem.name.shouldbe_nil#thenIcoulddoitem.category.shouldbe_nilendend但我想要一些方法来使用
ruby - 使用 RubyZip 生成 ZIP 文件时设置压缩级别 - 2
我有一个Ruby程序，它使用rubyzip压缩XML文件的目录树。gem。我的问题是文件开始变得很重，我想提高压缩级别，因为压缩时间不是问题。我在rubyzipdocumentation中找不到一种为创建的ZIP文件指定压缩级别的方法。有人知道如何更改此设置吗？是否有另一个允许指定压缩级别的Ruby库？最佳答案这是我通过查看rubyzip内部创建的代码。level=Zlib::BEST_COMPRESSIONZip::ZipOutputStream.open(zip_file)do|zip|Dir.glob("**/*")d
ruby - 为什么我可以在 Ruby 中使用 Object#send 访问私有(private)/ protected 方法？ - 2
类classAprivatedeffooputs:fooendpublicdefbarputs:barendprivatedefzimputs:zimendprotecteddefdibputs:dibendendA的实例a=A.new测试a.foorescueputs:faila.barrescueputs:faila.zimrescueputs:faila.dibrescueputs:faila.gazrescueputs:fail测试输出failbarfailfailfail.发送测试[:foo,:bar,:zim,:dib,:gaz].each{|m|a.send(m)resc
ruby-on-rails - 使用 Ruby on Rails 进行自动化测试 - 最佳实践 - 2
很好奇，就使用rubyonrails自动化单元测试而言，你们正在做什么？您是否创建了一个脚本来在cron中运行rake作业并将结果邮寄给您？git中的预提交Hook？只是手动调用？我完全理解测试，但想知道在错误发生之前捕获错误的最佳实践是什么。让我们理所当然地认为测试本身是完美无缺的，并且可以正常工作。下一步是什么以确保他们在正确的时间将可能有害的结果传达给您？最佳答案不确定您到底想听什么，但是有几个级别的自动代码库控制:在处理某项功能时，您可以使用类似autotest的内容获得关于哪些有效，哪些无效的即时反馈。要确保您的提
ruby - 在 Ruby 中使用匿名模块 - 2
假设我做了一个模块如下:m=Module.newdoclassCendend三个问题:除了对m的引用之外，还有什么方法可以访问C和m中的其他内容？我可以在创建匿名模块后为其命名吗(就像我输入“module...”一样)？如何在使用完匿名模块后将其删除，使其定义的常量不再存在？最佳答案三个答案:是的，使用ObjectSpace.此代码使c引用你的类(class)C不引用m:c=nilObjectSpace.each_object{|obj|c=objif(Class===objandobj.name=~/::C$/)}当然这取决于
ruby - 其他文件中的 Rake 任务 - 2
我试图在一个项目中使用rake，如果我把所有东西都放到Rakefile中，它会很大并且很难读取/找到东西，所以我试着将每个命名空间放在lib/rake中它自己的文件中，我添加了这个到我的rake文件的顶部:Dir['#{File.dirname(__FILE__)}/lib/rake/*.rake'].map{|f|requiref}它加载文件没问题，但没有任务。我现在只有一个.rake文件作为测试，名为“servers.rake”，它看起来像这样:namespace:serverdotask:testdoputs"test"endend所以当我运行rakeserver:testid时
ruby-on-rails - 在 Rails 中将文件大小字符串转换为等效千字节 - 2
我的目标是转换表单输入，例如“100兆字节”或“1GB”，并将其转换为我可以存储在数据库中的文件大小(以千字节为单位)。目前，我有这个:defquota_convert@regex=/([0-9]+)(.*)s/@sizes=%w{kilobytemegabytegigabyte}m=self.quota.match(@regex)if@sizes.include?m[2]eval("self.quota=#{m[1]}.#{m[2]}")endend这有效，但前提是输入是倍数(“gigabytes”，而不是“gigabyte”)并且由于使用了eval看起来疯狂不安全。所以，功能正常，
ruby - 使用 ruby 和 savon 的 SOAP 服务 - 2
我正在尝试使用ruby和Savon来使用网络服务。测试服务为http://www.webservicex.net/WS/WSDetails.aspx?WSID=9&CATID=2require'rubygems'require'savon'client=Savon::Client.new"http://www.webservicex.net/stockquote.asmx?WSDL"client.get_quotedo|soap|soap.body={:symbol=>"AAPL"}end返回SOAP异常。检查soap信封，在我看来soap请求没有正确的命名空间。任何人都可以建议我
python - 如何使用 Ruby 或 Python 创建一系列高音调和低音调的蜂鸣声？ - 2
关闭。这个问题是opinion-based.它目前不接受答案。想要改进这个问题？更新问题，以便editingthispost可以用事实和引用来回答它.关闭4年前。Improvethisquestion我想在固定时间创建一系列低音和高音调的哔哔声。例如:在150毫秒时发出高音调的蜂鸣声在151毫秒时发出低音调的蜂鸣声200毫秒时发出低音调的蜂鸣声250毫秒的高音调蜂鸣声有没有办法在Ruby或Python中做到这一点？我真的不在乎输出编码是什么(.wav、.mp3、.ogg等等)，但我确实想创建一个输出文件。

c++ - 如何使用 Boost.Spirit.Qi 增量解析(并作用于)大文件？

奖励:线程 worker

有关c++ - 如何使用 Boost.Spirit.Qi 增量解析(并作用于)大文件？的更多相关文章

随机推荐