LZW Compressor
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Pages
lzw_v6.cpp
Go to the documentation of this file.
1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 
22 
23 #include <algorithm>
24 #include <array>
25 #include <climits>
26 #include <cstddef>
27 #include <cstdint>
28 #include <cstdlib>
29 #include <exception>
30 #include <fstream>
31 #include <ios>
32 #include <iostream>
33 #include <istream>
34 #include <limits>
35 #include <memory>
36 #include <ostream>
37 #include <stdexcept>
38 #include <string>
39 #include <utility>
40 #include <vector>
41 
42 // Safety macro; if not defined, some overkillish safety checks are avoided.
43 //#define TAKE_NO_RISKS
44 
46 using CodeType = std::uint32_t;
47 
48 namespace globals {
49 
51 const CodeType dms {512 * 1024};
52 
53 } // namespace globals
54 
59 enum class MetaCode: CodeType {
60  Eof = 1u << CHAR_BIT,
61 };
62 
67 
71  struct Node {
72 
77  explicit Node(char c): first(globals::dms), c(c), left(globals::dms), right(globals::dms)
78  {
79  }
80 
82  char c;
85  };
86 
87 public:
88 
94  {
95  const long int minc = std::numeric_limits<char>::min();
96  const long int maxc = std::numeric_limits<char>::max();
97  CodeType k {0};
98 
99  for (long int c = minc; c <= maxc; ++c)
100  initials[static_cast<unsigned char> (c)] = k++;
101 
102  vn.reserve(globals::dms);
103  reset();
104  }
105 
110  void reset()
111  {
112  vn.clear();
113 
114  const long int minc = std::numeric_limits<char>::min();
115  const long int maxc = std::numeric_limits<char>::max();
116 
117  for (long int c = minc; c <= maxc; ++c)
118  vn.push_back(Node(c));
119 
120  // add dummy nodes for the metacodes
121  vn.push_back(Node('\x00')); // MetaCode::Eof
122  }
123 
132  {
133  if (i == globals::dms)
134  return search_initials(c);
135 
136  const CodeType vn_size = vn.size();
137  CodeType ci {vn[i].first}; // Current Index
138 
139  if (ci != globals::dms)
140  {
141  while (true)
142  if (c < vn[ci].c)
143  {
144  if (vn[ci].left == globals::dms)
145  {
146  vn[ci].left = vn_size;
147  break;
148  }
149  else
150  ci = vn[ci].left;
151  }
152  else
153  if (c > vn[ci].c)
154  {
155  if (vn[ci].right == globals::dms)
156  {
157  vn[ci].right = vn_size;
158  break;
159  }
160  else
161  ci = vn[ci].right;
162  }
163  else // c == vn[ci].c
164  return ci;
165  }
166  else
167  vn[i].first = vn_size;
168 
169  vn.push_back(Node(c));
170  return globals::dms;
171  }
172 
178  CodeType search_initials(char c) const
179  {
180  return initials[static_cast<unsigned char> (c)];
181  }
182 
186  std::vector<Node>::size_type size() const
187  {
188  return vn.size();
189  }
190 
191 private:
192 
194  std::vector<Node> vn;
195 
197  std::array<CodeType, 1u << CHAR_BIT> initials;
198 };
199 
203 struct ByteCache {
204 
208  ByteCache(): used(0), data(0x00)
209  {
210  }
211 
212  std::size_t used;
213  unsigned char data;
214 };
215 
219 class CodeWriter {
220 public:
221 
226  explicit CodeWriter(std::ostream &os): os(os), bits(CHAR_BIT + 1)
227  {
228  }
229 
235  {
236  write(static_cast<CodeType> (MetaCode::Eof));
237 
238  // write the incomplete leftover byte as-is
239  if (lo.used != 0)
240  os.put(static_cast<char> (lo.data));
241  }
242 
246  std::size_t get_bits() const
247  {
248  return bits;
249  }
250 
255  void reset_bits()
256  {
257  bits = CHAR_BIT + 1;
258  }
259 
266  {
267 #ifdef TAKE_NO_RISKS
268  if (bits == SIZE_MAX)
269  throw std::overflow_error("CodeWriter::increase_bits()");
270 #endif
271  ++bits;
272  }
273 
281  bool write(CodeType k)
282  {
283  std::size_t remaining_bits {bits};
284 
285  if (lo.used != 0)
286  {
287  lo.data |= k << lo.used;
288  os.put(static_cast<char> (lo.data));
289  k >>= CHAR_BIT - lo.used;
290  remaining_bits -= CHAR_BIT - lo.used;
291  lo.used = 0;
292  lo.data = 0x00;
293  }
294 
295  while (remaining_bits != 0)
296  if (remaining_bits >= CHAR_BIT)
297  {
298  os.put(static_cast<char> (k));
299  k >>= CHAR_BIT;
300  remaining_bits -= CHAR_BIT;
301  }
302  else
303  {
304  lo.used = remaining_bits;
305  lo.data = k;
306  break;
307  }
308 
309  return os;
310  }
311 
312 private:
313 
314  std::ostream &os;
315  std::size_t bits;
317 };
318 
322 class CodeReader {
323 public:
324 
329  explicit CodeReader(std::istream &is): is(is), bits(CHAR_BIT + 1), feofmc(false)
330  {
331  }
332 
336  std::size_t get_bits() const
337  {
338  return bits;
339  }
340 
345  void reset_bits()
346  {
347  bits = CHAR_BIT + 1;
348  }
349 
356  {
357 #ifdef TAKE_NO_RISKS
358  if (bits == SIZE_MAX)
359  throw std::overflow_error("CodeReader::increase_bits()");
360 #endif
361  ++bits;
362  }
363 
371  bool read(CodeType &k)
372  {
373  // ready-made bit masks
374  static const std::array<unsigned long int, 9> masks {
375  {0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}
376  };
377 
378  std::size_t remaining_bits {bits};
379  std::size_t offset {lo.used};
380  unsigned char temp;
381 
382  k = lo.data;
383  remaining_bits -= lo.used;
384  lo.used = 0;
385  lo.data = 0x00;
386 
387  while (remaining_bits != 0 && is.get(reinterpret_cast<char &> (temp)))
388  if (remaining_bits >= CHAR_BIT)
389  {
390  k |= static_cast<CodeType> (temp) << offset;
391  offset += CHAR_BIT;
392  remaining_bits -= CHAR_BIT;
393  }
394  else
395  {
396  k |= static_cast<CodeType> (temp & masks[remaining_bits]) << offset;
397  lo.used = CHAR_BIT - remaining_bits;
398  lo.data = temp >> remaining_bits;
399  break;
400  }
401 
402  if (k == static_cast<CodeType> (MetaCode::Eof))
403  {
404  feofmc = true;
405  return false;
406  }
407 
408  return is;
409  }
410 
416  bool corrupted() const
417  {
418  return !feofmc;
419  }
420 
421 private:
422 
423  std::istream &is;
424  std::size_t bits;
425  bool feofmc;
427 };
428 
434 std::size_t required_bits(unsigned long int n)
435 {
436  std::size_t r {1};
437 
438  while ((n >>= 1) != 0)
439  ++r;
440 
441  return r;
442 }
443 
449 void compress(std::istream &is, std::ostream &os)
450 {
452  CodeWriter cw(os);
453  CodeType i {globals::dms}; // Index
454  char c;
455  bool rbwf {false}; // Reset Bit Width Flag
456 
457  while (is.get(c))
458  {
459  // dictionary's maximum size was reached
460  if (ed.size() == globals::dms)
461  {
462  ed.reset();
463  rbwf = true;
464  }
465 
466  const CodeType temp {i};
467 
468  if ((i = ed.search_and_insert(temp, c)) == globals::dms)
469  {
470  cw.write(temp);
471  i = ed.search_initials(c);
472 
473  if (required_bits(ed.size() - 1) > cw.get_bits())
474  cw.increase_bits();
475  }
476 
477  if (rbwf)
478  {
479  cw.reset_bits();
480  rbwf = false;
481  }
482  }
483 
484  if (i != globals::dms)
485  cw.write(i);
486 }
487 
493 void decompress(std::istream &is, std::ostream &os)
494 {
495  std::vector<std::pair<CodeType, char>> dictionary;
496 
497  // "named" lambda function, used to reset the dictionary to its initial contents
498  const auto reset_dictionary = [&dictionary] {
499  dictionary.clear();
500  dictionary.reserve(globals::dms);
501 
502  const long int minc = std::numeric_limits<char>::min();
503  const long int maxc = std::numeric_limits<char>::max();
504 
505  for (long int c = minc; c <= maxc; ++c)
506  dictionary.push_back({globals::dms, static_cast<char> (c)});
507 
508  // add dummy elements for the metacodes
509  dictionary.push_back({0, '\x00'}); // MetaCode::Eof
510  };
511 
512  const auto rebuild_string = [&dictionary](CodeType k) -> const std::vector<char> * {
513  static std::vector<char> s; // String
514 
515  s.clear();
516 
517  // the length of a string cannot exceed the dictionary's number of entries
518  s.reserve(globals::dms);
519 
520  while (k != globals::dms)
521  {
522  s.push_back(dictionary[k].second);
523  k = dictionary[k].first;
524  }
525 
526  std::reverse(s.begin(), s.end());
527  return &s;
528  };
529 
530  reset_dictionary();
531 
532  CodeReader cr(is);
533  CodeType i {globals::dms}; // Index
534  CodeType k; // Key
535 
536  while (true)
537  {
538  // dictionary's maximum size was reached
539  if (dictionary.size() == globals::dms)
540  {
541  reset_dictionary();
542  cr.reset_bits();
543  }
544 
545  if (required_bits(dictionary.size()) > cr.get_bits())
546  cr.increase_bits();
547 
548  if (!cr.read(k))
549  break;
550 
551  if (k > dictionary.size())
552  throw std::runtime_error("invalid compressed code");
553 
554  const std::vector<char> *s; // String
555 
556  if (k == dictionary.size())
557  {
558  dictionary.push_back({i, rebuild_string(i)->front()});
559  s = rebuild_string(k);
560  }
561  else
562  {
563  s = rebuild_string(k);
564 
565  if (i != globals::dms)
566  dictionary.push_back({i, s->front()});
567  }
568 
569  os.write(&s->front(), s->size());
570  i = k;
571  }
572 
573  if (cr.corrupted())
574  throw std::runtime_error("corrupted compressed file");
575 }
576 
582 void print_usage(const std::string &s = "", bool su = true)
583 {
584  if (!s.empty())
585  std::cerr << "\nERROR: " << s << '\n';
586 
587  if (su)
588  {
589  std::cerr << "\nUsage:\n";
590  std::cerr << "\tprogram -flag input_file output_file\n\n";
591  std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
592  std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
593  std::cerr << "Examples:\n";
594  std::cerr << "\tlzw_v6.exe -c license.txt license.lzw\n";
595  std::cerr << "\tlzw_v6.exe -d license.lzw new_license.txt\n";
596  }
597 
598  std::cerr << std::endl;
599 }
600 
608 int main(int argc, char *argv[])
609 {
610  if (argc != 4)
611  {
612  print_usage("Wrong number of arguments.");
613  return EXIT_FAILURE;
614  }
615 
616  enum class Mode {
617  Compress,
618  Decompress
619  };
620 
621  Mode m;
622 
623  if (std::string(argv[1]) == "-c")
624  m = Mode::Compress;
625  else
626  if (std::string(argv[1]) == "-d")
627  m = Mode::Decompress;
628  else
629  {
630  print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
631  return EXIT_FAILURE;
632  }
633 
634  const std::size_t buffer_size {1024 * 1024};
635 
636  // these custom buffers should be larger than the default ones
637  const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
638  const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);
639 
640  std::ifstream input_file;
641  std::ofstream output_file;
642 
643  input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
644  input_file.open(argv[2], std::ios_base::binary);
645 
646  if (!input_file.is_open())
647  {
648  print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
649  return EXIT_FAILURE;
650  }
651 
652  output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
653  output_file.open(argv[3], std::ios_base::binary);
654 
655  if (!output_file.is_open())
656  {
657  print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
658  return EXIT_FAILURE;
659  }
660 
661  try
662  {
663  input_file.exceptions(std::ios_base::badbit);
664  output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);
665 
666  if (m == Mode::Compress)
667  compress(input_file, output_file);
668  else
669  if (m == Mode::Decompress)
670  decompress(input_file, output_file);
671  }
672  catch (const std::ios_base::failure &f)
673  {
674  print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
675  return EXIT_FAILURE;
676  }
677  catch (const std::exception &e)
678  {
679  print_usage(std::string("Caught exception: ") + e.what() + '.', false);
680  return EXIT_FAILURE;
681  }
682 
683  return EXIT_SUCCESS;
684 }