LZW Compressor
 All Classes Files Functions Variables Typedefs
lzw_v5.cpp File Reference

LZW file compressor. More...

#include <algorithm>
#include <array>
#include <climits>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <ios>
#include <iostream>
#include <istream>
#include <limits>
#include <memory>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

Go to the source code of this file.

Classes

class  EncoderDictionary
 Encoder's custom dictionary type. More...
 
struct  EncoderDictionary::Node
 Binary search tree node. More...
 

Typedefs

using CodeType = std::uint16_t
 Type used to store and retrieve codes.
 

Functions

void compress (std::istream &is, std::ostream &os)
 Compresses the contents of is and writes the result to os. More...
 
void decompress (std::istream &is, std::ostream &os)
 Decompresses the contents of is and writes the result to os. More...
 
void print_usage (const std::string &s="", bool su=true)
 Prints usage information and a custom error message. More...
 
int main (int argc, char *argv[])
 Actual program entry point. More...
 

Variables

const CodeType globals::dms {std::numeric_limits<CodeType>::max()}
 Dictionary Maximum Size (when reached, the dictionary will be reset)
 

Detailed Description

LZW file compressor.

Author
Julius Pettersson
Version
5
Remarks
This version borrows heavily from Juha Nieminen's work.

This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line compressor. It uses the simpler fixed-width code compression method. It was written with Doxygen comments.

See Also
http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
http://marknelson.us/2011/11/08/lzw-revisited/
http://www.cs.duke.edu/csed/curious/compression/lzw.html
http://warp.povusers.org/EfficientLZW/index.html
http://en.cppreference.com/
http://www.doxygen.org/

Definition in file lzw_v5.cpp.

Function Documentation

void compress ( std::istream &  is,
std::ostream &  os 
)

Compresses the contents of is and writes the result to os.

Parameters
[in]isinput stream
[out]osoutput stream

Definition at line 185 of file lzw_v5.cpp.

References EncoderDictionary::search_and_insert(), and EncoderDictionary::search_initials().

Referenced by main().

186 {
188  CodeType i {globals::dms}; // Index
189  char c;
190 
191  while (is.get(c))
192  {
193  const CodeType temp {i};
194 
195  if ((i = ed.search_and_insert(temp, c)) == globals::dms)
196  {
197  os.write(reinterpret_cast<const char *> (&temp), sizeof (CodeType));
198  i = ed.search_initials(c);
199  }
200  }
201 
202  if (i != globals::dms)
203  os.write(reinterpret_cast<const char *> (&i), sizeof (CodeType));
204 }
void decompress ( std::istream &  is,
std::ostream &  os 
)

Decompresses the contents of is and writes the result to os.

Parameters
[in]isinput stream
[out]osoutput stream

Definition at line 211 of file lzw_v5.cpp.

Referenced by main().

212 {
213  std::vector<std::pair<CodeType, char>> dictionary;
214 
215  // "named" lambda function, used to reset the dictionary to its initial contents
216  const auto reset_dictionary = [&dictionary] {
217  dictionary.clear();
218  dictionary.reserve(globals::dms);
219 
220  const long int minc = std::numeric_limits<char>::min();
221  const long int maxc = std::numeric_limits<char>::max();
222 
223  for (long int c = minc; c <= maxc; ++c)
224  dictionary.push_back({globals::dms, static_cast<char> (c)});
225  };
226 
227  const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
228  static std::vector<char> s; // String
229 
230  s.clear();
231 
232  // the length of a string cannot exceed the dictionary's number of entries
233  s.reserve(globals::dms);
234 
235  while (k != globals::dms)
236  {
237  s.push_back(dictionary[k].second);
238  k = dictionary[k].first;
239  }
240 
241  std::reverse(s.begin(), s.end());
242  return &s;
243  };
244 
245  reset_dictionary();
246 
247  CodeType i {globals::dms}; // Index
248  CodeType k; // Key
249 
250  while (is.read(reinterpret_cast<char *> (&k), sizeof (CodeType)))
251  {
252  // dictionary's maximum size was reached
253  if (dictionary.size() == globals::dms)
254  reset_dictionary();
255 
256  if (k > dictionary.size())
257  throw std::runtime_error("invalid compressed code");
258 
259  const std::vector<char> *s; // String
260 
261  if (k == dictionary.size())
262  {
263  dictionary.push_back({i, rebuild_string(i)->front()});
264  s = rebuild_string(k);
265  }
266  else
267  {
268  s = rebuild_string(k);
269 
270  if (i != globals::dms)
271  dictionary.push_back({i, s->front()});
272  }
273 
274  os.write(&s->front(), s->size());
275  i = k;
276  }
277 
278  if (!is.eof() || is.gcount() != 0)
279  throw std::runtime_error("corrupted compressed file");
280 }
int main ( int  argc,
char *  argv[] 
)

Actual program entry point.

Parameters
argcnumber of command line arguments
[in]argvarray of command line arguments
Return values
EXIT_FAILUREfor failed operation
EXIT_SUCCESSfor successful operation

Definition at line 313 of file lzw_v5.cpp.

References compress(), decompress(), and print_usage().

314 {
315  if (argc != 4)
316  {
317  print_usage("Wrong number of arguments.");
318  return EXIT_FAILURE;
319  }
320 
321  enum class Mode {
322  Compress,
323  Decompress
324  };
325 
326  Mode m;
327 
328  if (std::string(argv[1]) == "-c")
329  m = Mode::Compress;
330  else
331  if (std::string(argv[1]) == "-d")
332  m = Mode::Decompress;
333  else
334  {
335  print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
336  return EXIT_FAILURE;
337  }
338 
339  const std::size_t buffer_size {1024 * 1024};
340 
341  // these custom buffers should be larger than the default ones
342  const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
343  const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
344 
345  std::ifstream input_file;
346  std::ofstream output_file;
347 
348  input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
349  input_file.open(argv[2], std::ios_base::binary);
350 
351  if (!input_file.is_open())
352  {
353  print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
354  return EXIT_FAILURE;
355  }
356 
357  output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
358  output_file.open(argv[3], std::ios_base::binary);
359 
360  if (!output_file.is_open())
361  {
362  print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
363  return EXIT_FAILURE;
364  }
365 
366  try
367  {
368  input_file.exceptions(std::ios_base::badbit);
369  output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
370 
371  if (m == Mode::Compress)
372  compress(input_file, output_file);
373  else
374  if (m == Mode::Decompress)
375  decompress(input_file, output_file);
376  }
377  catch (const std::ios_base::failure &f)
378  {
379  print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
380  return EXIT_FAILURE;
381  }
382  catch (const std::exception &e)
383  {
384  print_usage(std::string("Caught exception: ") + e.what() + '.', false);
385  return EXIT_FAILURE;
386  }
387 
388  return EXIT_SUCCESS;
389 }
void print_usage ( const std::string &  s = "",
bool  su = true 
)

Prints usage information and a custom error message.

Parameters
scustom error message to be printed
suShow Usage information

Definition at line 287 of file lzw_v5.cpp.

Referenced by main().

288 {
289  if (!s.empty())
290  std::cerr << "\nERROR: " << s << '\n';
291 
292  if (su)
293  {
294  std::cerr << "\nUsage:\n";
295  std::cerr << "\tprogram -flag input_file output_file\n\n";
296  std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
297  std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
298  std::cerr << "Examples:\n";
299  std::cerr << "\tlzw_v5.exe -c license.txt license.lzw\n";
300  std::cerr << "\tlzw_v5.exe -d license.lzw new_license.txt\n";
301  }
302 
303  std::cerr << std::endl;
304 }