WoStransform.cc

Go to the documentation of this file.
00001 /* WoStransform.cc */
00002 /* program to transform web-of-science file entries */
00003 
00004 /* Copyright (C) 2002 - 2009, Bernd Speiser */
00005 /* This file is part of WoStransform.
00006 
00007 WoStransform is free software; you can redistribute it and/or
00008 modify it under the terms of the GNU General Public License
00009 as published by the Free Software Foundation; either version 2
00010 of the License, or (at your option) any later version.
00011 
00012 WoStransform is distributed in the hope that it will be useful,
00013 but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 GNU General Public License for more details.
00016 
00017 You should have received a copy of the GNU General Public License
00018 along with this program; if not, write to the Free Software
00019 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
00020 02111-1307, USA.
00021 */
00022 
00023 #include <iostream>
00024 #include <fstream>
00025 #include <vector>
00026 #include <cstring>
00027 #include <cstdlib>
00028 
00029 #include "WoSentry.h"
00030 
00031 using namespace WoStransform;
00032 
00033 int main (int argc, char *argv[])
00034 {
00035   if (argc == 1)
00036   {
00037     std::cout << "usage: WoStransform [options] -o output_file input_file(s)" 
00038          << std::endl << "or: WoStransform --version" << std::endl;
00039 
00040     exit (1);
00041   }
00042 
00043   if (!strcmp (argv[1], "--version"))
00044   {
00045       std::cout << "WoStransform, version 1.3" << std::endl;
00046 
00047       exit (0);
00048   }
00049 
00050   std::cout << "starting WoStransform - transformation of web-of-science " 
00051        << std::endl 
00052        << "                        files into bibtex format" << std::endl
00053        << "version 1.3, Copyright (C) 2002 - 2009, Bernd Speiser" << std::endl
00054        << "WoStransform comes with ABSOLUTELY NO WARRANTY; for details see"
00055        << std::endl 
00056        << "the General Public License." << std::endl
00057        << "This is free software, and you are welcome to redistribute it"
00058        << std::endl 
00059        << "under certain conditions; see the General Public License for "
00060           "details." 
00061        << std::endl;
00062 
00063   if (argc < 4)
00064   {
00065     std::cout << "wrong number of arguments; " << std::endl <<
00066             "usage: WoStransform [options] -o output_file input_file(s)" 
00067          << std::endl << "or: WoStransform --version" << std::endl;
00068 
00069     exit (1);
00070   }
00071 
00072   int outputfile_pointer = 0;
00073 
00074   bool verbose = false;
00075 
00076   for (int i = 1; i < argc; i++)
00077   {
00078     if (!strcmp (argv[i], "--verbose"))
00079       verbose = true;
00080 
00081     else if (!strcmp (argv[i],"-o"))
00082       outputfile_pointer = i + 1;
00083   }
00084 
00085   if (!outputfile_pointer)
00086   {
00087     std::cout << "no outputfile given; usage: WoStransform "
00088                  "[options] -o output_file input_file(s) " << std::endl;
00089 
00090     exit (1);
00091   }
00092 
00093   std::vector <WoSentry *> entries;
00094 
00095   const int buffersize = 1000;
00096 
00097   char buffer[buffersize];
00098 
00099   int entry_counter = 0;
00100 
00101   for (int j = outputfile_pointer + 1; j < argc; j++)
00102   {
00103     std::ifstream WoSinput (argv[j]);
00104 
00105     if (!WoSinput)
00106     {
00107       std::cout << "can't open file: " << argv[j] << std::endl;
00108 
00109       exit (1);
00110     }
00111 
00112     if (verbose)
00113       std::cout << "reading input file: `" << argv[j] << "´" << std::endl;
00114 
00115     WoSinput.getline (buffer, buffersize);
00116 
00117     if (strcmp (buffer, "FN ISI Export Format"))
00118       std::cout << "file `" << argv[j] 
00119            << "´ not in recognized web-of-science format, skipped" 
00120                                                          << std::endl;
00121     else
00122     {
00123       int line_number = 1;
00124 
00125       std::string bufferstring;
00126 
00127       std::string entrytext;
00128 
00129       while (WoSinput.getline (buffer, buffersize))
00130       {
00131         line_number++;
00132 
00133         bufferstring = buffer;
00134 
00135         bufferstring += '\n';
00136 
00137         if (bufferstring.substr (0,3) == "PT ")
00138         {
00139           entrytext += bufferstring;
00140 
00141           if (bufferstring.substr (3) == "Journal\n" 
00142                                     || bufferstring.substr (3) == "J\n"
00143                                     || bufferstring.substr (3) == "C\n")
00144           {
00145             if (verbose)
00146               std::cout << "generating new entry of type " 
00147                 << bufferstring.substr 
00148                                   (3, bufferstring.find_last_of ('\n') -
00149                   bufferstring.find_first_not_of (" ", 3)) << std::endl;
00150 
00151             entry_counter++;
00152 
00153             WoSentry *entry = new JournalWoSentry;
00154 
00155             while (WoSinput.getline (buffer, buffersize))
00156             {
00157               line_number++;
00158 
00159               bufferstring = buffer;
00160 
00161               bufferstring += '\n';
00162 
00163               if (bufferstring.substr (0,2) == "ER")
00164               {
00165                 *entry << entrytext;
00166 
00167                 entries.push_back (entry);
00168 
00169                 entrytext = "";
00170 
00171                 break;
00172               }
00173 
00174               entrytext += bufferstring;
00175             }
00176           }
00177 
00178           else if (bufferstring.substr (3) == "Book in series\n"
00179                                      || bufferstring.substr (3) == "S\n") 
00180           {
00181             if (verbose)
00182               std::cout << "generating new entry of type " 
00183                 << bufferstring.substr 
00184                                   (3, bufferstring.find_last_of ('\n') -
00185                   bufferstring.find_first_not_of (" ", 3)) << std::endl;
00186 
00187             WoSentry *entry = new BookinseriesWoSentry;
00188 
00189             while (WoSinput.getline (buffer, buffersize))
00190             {
00191               line_number++;
00192 
00193               bufferstring = buffer;
00194 
00195               bufferstring += '\n';
00196 
00197               if (bufferstring.substr (0,2) == "ER")
00198               {
00199                 *entry << entrytext;
00200 
00201                 entries.push_back (entry);
00202 
00203                 entrytext = "";
00204 
00205                 break;
00206               }
00207 
00208               entrytext += bufferstring;
00209             }
00210           }
00211 
00212           else
00213           {
00214             std::cout << "unknown publication type `" 
00215               << bufferstring.substr (3, 
00216                 bufferstring.find_last_of ('\n') - 
00217                   bufferstring.find_first_not_of (" ", 3)) << 
00218                     "' in line "<< line_number << " of file " << argv[j]
00219                                                            << std::endl;
00220           }
00221         }
00222       }
00223     }
00224   }
00225 
00226   std::vector<WoSentry *>::const_iterator entrypointer;
00227 
00228   std::cout << "writing to " << argv[outputfile_pointer] << std::endl;
00229 
00230   std::ofstream bibtexoutput (argv[outputfile_pointer]);
00231 
00232   if (!bibtexoutput)
00233   {
00234     std::cout << "can't open file: " 
00235                                << argv[outputfile_pointer] << std::endl;
00236     exit (1);
00237   }
00238 
00239   for (entrypointer = entries.begin (); 
00240                         entrypointer != entries.end (); ++ entrypointer)
00241   {
00242     (*entrypointer)->deconstruct ();
00243 
00244     (*entrypointer)->format_authors ();
00245 
00246     (*entrypointer)->output (bibtexoutput);
00247   }
00248 
00249   if (entry_counter == 1)
00250     std::cout << entry_counter << " entry generated " << std::endl;
00251 
00252   else if (entry_counter > 1)
00253     std::cout << entry_counter << " entries generated " << std::endl;
00254 
00255   exit (0);
00256 }
00257 
00258 void JournalWoSentry::deconstruct (void)
00259 {
00260   std::string::size_type title_begin;
00261   std::string::size_type lineend;
00262 
00263   title_begin = text.find ("\nTI"); 
00264 
00265   title_begin = text.find_first_not_of (" ", title_begin + 3);
00266 
00267   title = "";
00268 
00269   while (title_begin != std::string::npos)
00270   {
00271     lineend = text.find ("\n", title_begin);
00272 
00273     title += text.substr (title_begin, lineend - title_begin);
00274 
00275     if (text.at (lineend + 1) != ' ')
00276       break;
00277 
00278     title_begin = lineend + 3;
00279   }
00280 
00281   std::string::size_type journal_begin;
00282 
00283   journal_begin = text.find ("\nJI"); 
00284 
00285   journal_begin = text.find_first_not_of (" ", journal_begin + 3);
00286 
00287   journal = "";
00288 
00289   while (journal_begin != std::string::npos)
00290   {
00291     lineend = text.find ("\n", journal_begin);
00292 
00293     journal += text.substr (journal_begin, lineend - journal_begin);
00294 
00295     if (text.at (lineend + 1) != ' ')
00296       break;
00297 
00298     journal_begin = lineend + 3;
00299   }
00300 
00301   std::string::size_type authors_begin;
00302 
00303   authors_begin = text.find ("\nAU"); 
00304 
00305   authors_begin = text.find_first_not_of (" ", authors_begin + 3);
00306 
00307   authors = "";
00308 
00309   while (authors_begin != std::string::npos)
00310   {
00311     lineend = text.find ("\n", authors_begin);
00312 
00313     authors += text.substr (authors_begin, lineend - authors_begin);
00314 
00315     authors += ";";
00316 
00317     if (text.at (lineend + 1) != ' ')
00318       break;
00319 
00320     authors_begin = lineend + 3;
00321   }
00322 
00323   std::string::size_type printing_year_begin;
00324 
00325   printing_year_begin = text.find ("\nPY"); 
00326 
00327   printing_year_begin 
00328                 = text.find_first_not_of (" ", printing_year_begin + 3);
00329 
00330   printing_year = "";
00331 
00332   lineend = text.find ("\n", printing_year_begin);
00333 
00334   printing_year += 
00335        text.substr (printing_year_begin, lineend - printing_year_begin);
00336 
00337   std::string::size_type begin_page_begin;
00338 
00339   begin_page_begin = text.find ("\nBP"); 
00340 
00341   begin_page_begin 
00342                 = text.find_first_not_of (" ", begin_page_begin + 3);
00343 
00344   begin_page = "";
00345 
00346   lineend = text.find ("\n", begin_page_begin);
00347 
00348   begin_page += 
00349        text.substr (begin_page_begin, lineend - begin_page_begin);
00350 
00351   std::string::size_type end_page_begin;
00352 
00353   end_page_begin = text.find ("\nEP"); 
00354 
00355   end_page_begin 
00356                 = text.find_first_not_of (" ", end_page_begin + 3);
00357 
00358   end_page = "";
00359 
00360   lineend = text.find ("\n", end_page_begin);
00361 
00362   end_page += 
00363        text.substr (end_page_begin, lineend - end_page_begin);
00364 
00365   std::string::size_type volume_begin;
00366 
00367   volume_begin = text.find ("\nVL"); 
00368 
00369   volume = "";
00370 
00371   if (volume_begin != std::string::npos)
00372   {
00373     volume_begin = text.find_first_not_of (" ", volume_begin + 3);
00374 
00375     lineend = text.find ("\n", volume_begin);
00376 
00377     volume += text.substr (volume_begin, lineend - volume_begin);
00378   }
00379 }
00380 
00381 std::ofstream & JournalWoSentry::output (std::ofstream &outstream)
00382 {
00383   outstream << "@article{" << key << "," << std::endl;
00384 
00385   outstream << "author = {" << authors << "}," << std::endl;
00386 
00387   outstream << "journal = {" << journal << "}," << std::endl;
00388 
00389   if (!volume.empty ())
00390     outstream << "volume = {" << volume << "}," << std::endl;
00391 
00392   outstream << "pages = {" << begin_page << "~--~" << end_page 
00393                                                    << "}," << std::endl;
00394 
00395   outstream << "year = {" << printing_year << "}," << std::endl;
00396 
00397   outstream << "title = {" << title << "}," << std::endl;
00398 
00399   outstream << "topics = {}," << std::endl;
00400   outstream << "copy = {}" << std::endl;
00401 
00402   outstream << "}" << std::endl;
00403 }
00404 
00405 void WoSentry::format_authors (void)
00406 {
00407   std::string buffer = authors;
00408 
00409   authors = "";
00410 
00411   std::string author;
00412   std::string correct_author;
00413   std::string lastname;
00414   std::string firstnames;
00415   std::string firstnames_string;
00416 
00417   std::vector<std::string> lastnames;
00418 
00419   std::string::size_type begin_author;
00420   std::string::size_type end_author;
00421   std::string::size_type end_authorstring;
00422   std::string::size_type end_firstname;
00423   std::string::size_type index;
00424 
00425   begin_author = buffer.find_first_not_of (" ");  
00426   end_author = buffer.find (";", begin_author);  
00427   end_authorstring = buffer.find_last_of (";");  
00428 
00429   while (end_author <= end_authorstring)
00430   {
00431     author = buffer.substr (begin_author, end_author - begin_author);
00432 
00433     end_firstname = author.find_last_of (" ");
00434 
00435     lastname = author.substr (0, author.find_last_of (","));
00436 
00437     lastnames.push_back (lastname);
00438 
00439     firstnames_string 
00440                   = author.substr (author.find_last_of (" ") + 1) + " ";
00441 
00442     firstnames 
00443       = firstnames_string.substr 
00444                             (firstnames_string.find_first_not_of (" "));
00445 
00446     std::string::iterator pointer = firstnames.begin ();
00447 
00448     index = firstnames.find_first_not_of (" ");
00449 
00450     int idx = 0;
00451 
00452     while (firstnames.at(idx) != ' ')
00453     {
00454       firstnames.insert (++index, ".");
00455 
00456       index++;
00457 
00458       idx += 2;
00459     }
00460 
00461     correct_author = firstnames + lastname;
00462 
00463     if (authors.empty ())
00464       authors += correct_author;
00465 
00466     else
00467       authors = authors + " and " + correct_author;
00468 
00469     if (end_author == end_authorstring)
00470       break;
00471 
00472     begin_author = buffer.find_first_not_of (" ", ++end_author);  
00473     end_author = buffer.find (";", begin_author);  
00474 
00475   }
00476 
00477   int authornumber = lastnames.size ();
00478 
00479   std::vector<std::string>::const_iterator lastname_pointer;
00480 
00481   lastname_pointer = lastnames.begin ();
00482 
00483   if (authornumber == 1)
00484     key = *lastname_pointer + "_" + printing_year;
00485 
00486   else if (authornumber == 2)
00487     key = *lastname_pointer + "/" + *(lastname_pointer + 1) + "_" 
00488                                                                 + printing_year;
00489 
00490   else if (authornumber > 2)
00491     key = *lastname_pointer + "/" + *(lastname_pointer + 1) + "/etal_" 
00492                                                                 + printing_year;
00493 
00494   else
00495     std::cout << "no author found!!!" << std::endl;
00496 
00497 }
00498 
00499 void BookinseriesWoSentry::deconstruct (void)
00500 {
00501   std::string::size_type title_begin;
00502   std::string::size_type lineend;
00503 
00504   title_begin = text.find ("\nTI"); 
00505 
00506   title_begin = text.find_first_not_of (" ", title_begin + 3);
00507 
00508   title = "";
00509 
00510   while (title_begin != std::string::npos)
00511   {
00512     lineend = text.find ("\n", title_begin);
00513 
00514     title += text.substr (title_begin, lineend - title_begin);
00515 
00516     if (text.at (lineend + 1) != ' ')
00517       break;
00518 
00519     title_begin = lineend + 3;
00520   }
00521 
00522   std::string::size_type series_begin;
00523 
00524   series_begin = text.find ("\nSE"); 
00525 
00526   series_begin = text.find_first_not_of (" ", series_begin + 3);
00527 
00528   series = "";
00529 
00530   while (series_begin != std::string::npos)
00531   {
00532     lineend = text.find ("\n", series_begin);
00533 
00534     series += text.substr (series_begin, lineend - series_begin);
00535 
00536     if (text.at (lineend + 1) != ' ')
00537       break;
00538 
00539     series_begin = lineend + 3;
00540   }
00541 
00542   std::string::size_type authors_begin;
00543 
00544   authors_begin = text.find ("\nAU"); 
00545 
00546   authors_begin = text.find_first_not_of (" ", authors_begin + 3);
00547 
00548   authors = "";
00549 
00550   while (authors_begin != std::string::npos)
00551   {
00552     lineend = text.find ("\n", authors_begin);
00553 
00554     authors += text.substr (authors_begin, lineend - authors_begin);
00555 
00556     authors += ";";
00557 
00558     if (text.at (lineend + 1) != ' ')
00559       break;
00560 
00561     authors_begin = lineend + 3;
00562   }
00563 
00564   std::string::size_type printing_year_begin;
00565 
00566   printing_year_begin = text.find ("\nPY"); 
00567 
00568   printing_year_begin 
00569                 = text.find_first_not_of (" ", printing_year_begin + 3);
00570 
00571   printing_year = "";
00572 
00573   lineend = text.find ("\n", printing_year_begin);
00574 
00575   printing_year += 
00576        text.substr (printing_year_begin, lineend - printing_year_begin);
00577 
00578   std::string::size_type begin_page_begin;
00579 
00580   begin_page_begin = text.find ("\nBP"); 
00581 
00582   begin_page_begin 
00583                 = text.find_first_not_of (" ", begin_page_begin + 3);
00584 
00585   begin_page = "";
00586 
00587   lineend = text.find ("\n", begin_page_begin);
00588 
00589   begin_page += 
00590        text.substr (begin_page_begin, lineend - begin_page_begin);
00591 
00592   std::string::size_type end_page_begin;
00593 
00594   end_page_begin = text.find ("\nEP"); 
00595 
00596   end_page_begin 
00597                 = text.find_first_not_of (" ", end_page_begin + 3);
00598 
00599   end_page = "";
00600 
00601   lineend = text.find ("\n", end_page_begin);
00602 
00603   end_page += 
00604        text.substr (end_page_begin, lineend - end_page_begin);
00605 
00606   std::string::size_type volume_begin;
00607 
00608   volume_begin = text.find ("\nVL"); 
00609 
00610   volume = "";
00611 
00612   if (volume_begin != std::string::npos)
00613   {
00614     volume_begin = text.find_first_not_of (" ", volume_begin + 3);
00615 
00616     lineend = text.find ("\n", volume_begin);
00617 
00618     volume += text.substr (volume_begin, lineend - volume_begin);
00619   }
00620 }
00621 
00622 std::ofstream & BookinseriesWoSentry::output (std::ofstream &outstream)
00623 {
00624   outstream << "@article{" << key << "," << std::endl;
00625 
00626   outstream << "author = {" << authors << "}," << std::endl;
00627 
00628   outstream << "journal = {" << series << "}," << std::endl;
00629 
00630   if (!volume.empty ())
00631     outstream << "volume = {" << volume << "}," << std::endl;
00632 
00633   outstream << "pages = {" << begin_page << "~--~" << end_page 
00634                                                    << "}," << std::endl;
00635 
00636   outstream << "year = {" << printing_year << "}," << std::endl;
00637 
00638   outstream << "title = {" << title << "}," << std::endl;
00639 
00640   outstream << "topics = {}," << std::endl;
00641   outstream << "copy = {}" << std::endl;
00642 
00643   outstream << "}" << std::endl;
00644 }

Generated on Tue Mar 24 20:31:33 2009 for WoStransform by  doxygen 1.5.3