XipZ
Mini packer ▶►▸ for small programs.
xipz.cc
Go to the documentation of this file.
1 #include <inttypes.h>
2 #include <iostream>
3 #include <fstream>
4 #include <vector>
5 #include <iterator>
6 #include <stdexcept>
7 #include <algorithm>
8 #include <numeric>
9 #include <sstream>
10 #include <boost/format.hpp>
11 #include "cmdline.h"
12 #include "data.hh"
13 #include "qadz.hh"
14 
23 /*
24 00000000 01 08 0a 08 02 03 9e 32 30 36 31 00 00 00 a2 08 |.......2061.....|
25 00000010 bd 35 08 95 58 ca 10 f8 20 bf a3 78 b9 3e 08 99 |.5..X... ..x.>..|
26 00000020 f7 00 c8 d0 f7 a5 58 18 69 ff aa a5 59 69 00 8d |......X.i...Yi..|
27 End address to shift to: $36
28 Pointer to End of Data: $38. Set to $…+table_size+length_of_compressed_data.
29 Pointer to Begin of Data: $3d. Set to $…+table_size
30 00000030 19 01 98 4c ff 00 00 10 7d 08 54 37 44 7d 08 a8 |...L....}.T7D}..|
31 N: $46
32 00000040 f0 03 a0 08 2c a0 05 46 2b d0 14 66 2b e8 d0 0f |....,..F+..f+...|
33 High byte to stop reading at: $58
34 Jump: $5c
35 00000050 ee 19 01 48 ad 19 01 c9 10 d0 03 4c e2 fc 68 1e |...H.......L..h.|
36 Destination address: $6f
37 00000060 00 00 2a 88 30 d9 d0 df b0 04 a8 b9 36 01 8d 3c |..*.0.......6..<|
38 00000070 03 a0 00 98 ee 27 01 d0 ce ee 28 01 d0 c9 |.....'....(...|
39 */
40 #include "decrunchxipzstub.inc"
/* Position in the stub where the end of the compressed data is stored. */
42 #define POS_OF_END_OF_CDATA 0x38
/* Position in the stub where the beginning of the compressed data is stored. */
43 #define POS_OF_BEGIN_OF_CDATA 0x3d
/* Position in the stub where the number of bits for the compression is stored. */
45 #define POS_OF_N 0x46
/* Position in the stub where the address is stored at which the data is decompressed. */
47 #define POS_OF_DEST 0x6f
/* Position in the stub where the address is stored at which the final JMP is performed. */
49 #define POS_OF_JMP 0x5c
/* Position in the stub of the high byte at which the decruncher stops reading. */
51 #define POS_OF_STOPREADING 0x58
53 
54 
/* \brief Small value type holding a bit pattern and its length.
 *
 * The compressor keeps one of these per entry of the
 * \ref CompressionBits array.
 */
struct Bits {
	unsigned data = 0; // Storage area for the bits.
	unsigned bits = 0; // Number of bits stored.

	// Default constructor: an empty pattern of length zero.
	Bits() = default;

	// Construct from the pattern data_ which is n bits long.
	Bits(unsigned data_, unsigned n) : data{data_}, bits{n} {}
};
73 
74 
80 std::ostream &operator<<(std::ostream &o, const Bits &x) {
81  for(int i = x.bits - 1; i >= 0; --i) {
82  if((x.data & (1 << i)) != 0) {
83  o << '1';
84  } else {
85  o << '0';
86  }
87  }
88  return o;
89 }
90 
91 
/* \brief Histogram entry: a byte value together with a count. */
class HistEntry {
public:
	uint8_t byte;        // The byte value this entry describes.
	unsigned long freq;  // The count stored for that byte.

	// Deliberately inverted: an entry is "less" when its count is
	// larger, so an ascending sort places the highest counts first.
	bool operator<(const HistEntry &x) const {
		return x.freq < freq;
	}
};
103 
109 typedef std::vector<HistEntry> HistorgramArray;
110 
112 typedef std::array<Bits, 256> CompressionBits;
113 
114 
123 Data read_data(const std::string &fname) {
124  std::vector<uint8_t> rawdata;
125  std::ifstream inp(fname, std::ios::binary);
126  uint8_t tmp;
127 
128  if(!inp) {
129  std::ostringstream out;
130  out << "can not open file '" << fname << '\'';
131  throw std::runtime_error(out.str());
132  };
133  //So such iterator? std::copy(std::istreambuf_iterator<uint8_t>(inp), std::istreambuf_iterator<uint8_t>(), std::back_inserter(rawdata));
134  do {
135  tmp = inp.get();
136  if(!inp.eof()) {
137  rawdata.push_back(tmp);
138  }
139  } while(inp);
140  Data data(rawdata);
141  std::cout << "Bytes read (without load address): " << data.size() << std::endl;
142  std::cout << "Load address: " << data.get_loadaddr() << std::endl;
143  return data;
144 }
145 
146 
159  std::array<unsigned long, 256> histo;
160  HistorgramArray shisto;
161 
162  // Clear the histogram array (every body occurs zero times).
163  histo.fill(0);
164  // For each byte increment the occurrence counter.
165  for(uint8_t i : data) {
166  ++histo[i];
167  }
168  for(int i = 0; i < 256; ++i) {
169  shisto.push_back(HistEntry{static_cast<uint8_t>(i), histo[i]});
170  }
171  std::sort(shisto.begin(), shisto.end());
172  return shisto;
173 }
174 
175 
184 std::ostream &operator<<(std::ostream &o, const HistEntry &x) {
185  o << '(' << x.freq << " * $" << std::hex << (int)x.byte << std::dec << ')';
186  return o;
187 }
188 
189 
199 float calc_comp(int n, const Data &data, const HistorgramArray &histarr) {
200  int two_n = 1 << n;
201  unsigned long compressed_bytes = 0;
202  unsigned long total_bytes = data.size();
203 
204  std::for_each(histarr.begin(), histarr.begin() + two_n, [&compressed_bytes](const HistEntry &x) { compressed_bytes += x.freq; });
205  unsigned long literals = total_bytes - compressed_bytes;
206  float after_compression = 8 * literals; // Number of literal *bits*.
207  after_compression += n * compressed_bytes; // Compressed bits.
208  after_compression += total_bytes; // A bit for every compressed data token (compressed/literal)?
209  after_compression /= 8; // Calculate bytes.
210  std::cout << boost::format("Total compressed bytes (n=%d): %5.3f (%.8e:1)\n") % n % after_compression % (total_bytes / after_compression);
211  return after_compression;
212 }
213 
214 
226  CompressionBits bits;
227  int i;
228 
229  for(i = 0; i < 256; ++i) {
230  bits[i] = Bits(i, 8);
231  }
232  for(i = 0; i < (1 << n); ++i) {
233  bits[compressable[i].byte] = Bits(i, n);
234  }
235  return bits;
236 }
237 
238 
253 std::ostream &write_stub(std::ostream &out, int n, uint16_t size, uint16_t loadaddr, uint16_t jmp) {
254  // Create a local copy.
255  std::vector<uint8_t> stub(decrunchxipzstub, decrunchxipzstub + decrunchxipzstub_len);
256  unsigned endptr = stub.at(POS_OF_END_OF_CDATA) | (stub.at(POS_OF_END_OF_CDATA + 1) << 8);
257  unsigned beginptr = stub.at(POS_OF_BEGIN_OF_CDATA) | (stub.at(POS_OF_BEGIN_OF_CDATA + 1) << 8);
258 
259  // Assign number of bits.
260  stub.at(POS_OF_N) = n;
261  // Assign new begin of compressed data.
262  beginptr += 1 << n; // Add the current table size.;
263  stub.at(POS_OF_BEGIN_OF_CDATA) = beginptr & 0xFF;
264  stub.at(POS_OF_BEGIN_OF_CDATA + 1) = (beginptr >> 8) & 0xFF;
265  // Assign new end of compressed data.
266  endptr += 1 << n; // Add the current table size.
267  endptr += size; // Add number of bytes of compressed data.
268  endptr += 1; // End pointer must point to the byte *after* the data.
269  stub.at(POS_OF_END_OF_CDATA) = endptr & 0xFF;
270  stub.at(POS_OF_END_OF_CDATA + 1) = (endptr >> 8) & 0xFF;
271  // Assign the new jmp position.
272  stub.at(POS_OF_JMP) = jmp & 0xFF;
273  stub.at(POS_OF_JMP + 1) = (jmp >> 8) & 0xFF;
274  // Assign the destination position (currently equals the jump).
275  stub.at(POS_OF_DEST) = loadaddr & 0xFF;
276  stub.at(POS_OF_DEST + 1) = (loadaddr >> 8) & 0xFF;
277  // Set the maximal read position high-byte.
278  stub.at(POS_OF_STOPREADING) = 0x10; // Todo: configurable!
279  // Now copy the modified stub.
280  std::copy(stub.begin(), stub.end(), std::ostream_iterator<unsigned char>(out));
281  return out;
282 }
283 
284 
295 std::ostream &write_compression_table(std::ostream &out, int n, const HistorgramArray &histe) {
296  for(int i = 0; i < (1 << n); ++i) {
297  const HistEntry &curr = histe.at(i);
298  out << curr.byte;
299  }
300  return out;
301 }
302 
303 
/* \brief Write the compressed data.
 *
 * Inserts every byte of data into the stream as a single character.
 *
 * \param out stream to write to
 * \param data bytes to write
 * \return the stream out
 */
std::ostream &write_compressed_data(std::ostream &out, const std::vector<uint8_t> &data) {
	for(const uint8_t value : data) {
		out << static_cast<unsigned char>(value);
	}
	return out;
}
316 
317 
333 std::vector<uint8_t> create_compressed_data(const Data &data, const CompressionBits &compbits) {
334  unsigned long bitstore = 0;
335  int bit = 0;
336  std::vector<uint8_t> out;
337 
338  for(auto i: data) {
339  bitstore <<= 1; // Move one bit to the left.
340  ++bit;
341  if(compbits[i].bits == 8) {
342  // This is uncompressed!
343  bitstore |= 1;
344  }
345  // Output the data.
346  bitstore <<= compbits[i].bits;
347  bit += compbits[i].bits;
348  bitstore |= compbits[i].data;
349  while(bit >= 8) {
350  out.push_back(static_cast<char>(bitstore >> (bit - 8)));
351  bit -= 8;
352  }
353 #ifdef DEBUG
354  std::cout << "█" << std::hex << (int)i << std::dec<< "🠢 " << compbits[i];
355 #endif
356  }
357  // Write remaining bits...
358  if(bit > 0) {
359  int fillbits = (bit % 8);
360  bitstore <<= fillbits;
361  bit += fillbits;
362  }
363  while(bit >= 8) {
364  out.push_back(static_cast<char>(bitstore >> (bit - 8)));
365  bit -= 8;
366  }
367  return out;
368 }
369 
370 
375 void output_64_common(const HistorgramArray &shisto) {
376  int count = 0;
377 
378  std::cout << "64 most common bytes:\n\t";
379  std::for_each(shisto.begin(), shisto.begin() + 64, [&count](const HistEntry &i) {
380  if(count++ >= 8) {
381  std::cout << "\n\t";
382  count = 0;
383  }
384  if(i.freq > 0) {
385  std::cout << ' ' << i;
386  }
387  });
388  std::cout << std::endl;
389 }
390 
400 int choose_optimal_n(const Data &data, const HistorgramArray &shisto) {
401  int n = 0;
402  float minsize = data.size();
403  float f;
404 
405  for(int i = 1; i <= 6; ++i) {
406  f = calc_comp(i, data, shisto);
407  if(f < minsize) {
408  n = i;
409  minsize = f;
410  }
411  }
412  return n;
413 }
414 
415 
425 int main_xipz(const std::string &inputname, const std::string &outputname, bool raw, int jump) {
426  Data data(read_data(inputname));
427  HistorgramArray shisto(calc_histo(data));
428  output_64_common(shisto);
429  int n = choose_optimal_n(data, shisto);
430  std::cout << "Optimal number of bits: N=" << n << std::endl;
431  CompressionBits compbits(create_compression_bits(shisto, n));
432  uint16_t jumpaddr = jump < 0 ? data.get_loadaddr() : jump;
433  std::ofstream out(outputname, std::ios::binary);
434  std::vector<uint8_t> cdata(create_compressed_data(data, compbits));
435  if(raw) {
436  std::cout << "Skipping writing the decrunching stub!\n";
437  } else {
438  std::cout << "Writing decrunching stub...\n";
439  write_stub(out, n, cdata.size(), data.get_loadaddr(), jumpaddr);
440  }
441  std::cout << boost::format("Writing table, %d bytes...\n") % (1 << n);
442  write_compression_table(out, n, shisto);
443  std::cout << boost::format("Writing %u bytes compressed data...\n") % cdata.size();
444  write_compressed_data(out, cdata);
445  return 0;
446 }
447 
457 int main_qadz(const std::string &inputname, const std::string &outputname, bool raw, int jump) {
458  uint16_t jumpaddr;
459 
460  Data data(read_data(inputname));
461  std::vector<uint8_t> compressed(crunch_qadz(data));
462  std::cout << "Compressed size: " << compressed.size() << std::endl;
463  std::ofstream out(outputname);
464  if(!raw) {
465  if(jump >= 0) {
466  jumpaddr = jump;
467  } else {
468  jumpaddr = data.get_loadaddr();
469  }
470  std::cout << "Writing decrunching stub...\n";
471  write_qadz_stub(out, compressed.size(), data.get_loadaddr(), jumpaddr);
472  }
473  write_compressed_data(out, compressed);
474  return 0;
475 }
476 
477 
482 int main(int argc, char **argv) {
483  gengetopt_args_info args;
484  int ret = -1;
485 
486  if(cmdline_parser(argc, argv, &args) != 0) {
487  return 1;
488  } else {
489  if(args.inputs_num < 1) {
490  std::cerr << "At least one filename must be provided!\n";
491  return 1;
492  }
493  std::cout << "XipZ Version " << CMDLINE_PARSER_VERSION << std::endl;
494  try {
495  std::string inpnam(args.inputs[0]);
496  std::string outnam;
497  if(args.inputs_num < 2) {
498  outnam = inpnam + (args.raw_given ? ".raw" : ".prg");
499  } else {
500  outnam = args.inputs[1];
501  }
502  switch(args.algorithm_arg) {
503  case algorithm_arg_xipz:
504  ret = main_xipz(inpnam, outnam, args.raw_given, args.jump_arg);
505  break;
506  case algorithm_arg_qadz:
507  ret = main_qadz(inpnam, outnam, args.raw_given, args.jump_arg);
508  break;
509  case algorithm__NULL:
510  throw std::logic_error("algorithm vanished");
511  }
512  }
513  catch(const std::exception &e) {
514  std::cerr << "Exception: " << e.what() << std::endl;
515  ret = -1;
516  }
517  }
518  return ret;
519 }
POS_OF_BEGIN_OF_CDATA
#define POS_OF_BEGIN_OF_CDATA
Position in the stub where the beginning of the compressed data is stored.
Definition: xipz.cc:44
qadz.hh
Crunching and writing stub for qadz.
output_64_common
void output_64_common(const HistorgramArray &shisto)
Definition: xipz.cc:375
crunch_qadz
std::vector< uint8_t > crunch_qadz(const Data &data)
Definition: qadz.cc:114
calc_comp
float calc_comp(int n, const Data &data, const HistorgramArray &histarr)
calculate compressed bytes
Definition: xipz.cc:199
cmdline_parser
int cmdline_parser(int argc, char **argv, struct gengetopt_args_info *args_info)
Definition: cmdline.c:326
Bits::Bits
Bits()
Default Constructor.
Definition: xipz.cc:65
Bits::data
unsigned data
Storage area for the bits.
Definition: xipz.cc:61
CompressionBits
std::array< Bits, 256 > CompressionBits
Convenience type definition for an array of 256 Bits entries.
Definition: xipz.cc:112
Data
Input data type.
Definition: data.hh:20
POS_OF_STOPREADING
#define POS_OF_STOPREADING
Position in the stub where the high byte of the destination address is stored at which the decompress...
Definition: xipz.cc:52
Data::size
std::vector< uint8_t >::size_type size() const
get data size
Definition: data.hh:54
gengetopt_args_info::algorithm_arg
enum enum_algorithm algorithm_arg
crunching algorithm to use (default='xipz').
Definition: cmdline.h:46
POS_OF_DEST
#define POS_OF_DEST
Position in the stub where the address is stored at which the data is decompressed.
Definition: xipz.cc:48
create_compression_bits
CompressionBits create_compression_bits(const HistorgramArray &compressable, int n)
create compression table
Definition: xipz.cc:225
POS_OF_N
#define POS_OF_N
Position in the stub where the number of bits for the compression is stored.
Definition: xipz.cc:46
POS_OF_JMP
#define POS_OF_JMP
Position in the stub where the address is stored at which the final JMP is performed.
Definition: xipz.cc:50
write_compression_table
std::ostream & write_compression_table(std::ostream &out, int n, const HistorgramArray &histe)
write compression table
Definition: xipz.cc:295
Bits::Bits
Bits(unsigned data_, unsigned n)
Definition: xipz.cc:71
Bits
Definition: xipz.cc:60
HistEntry
Histogram entry.
Definition: xipz.cc:96
write_stub
std::ostream & write_stub(std::ostream &out, int n, uint16_t size, uint16_t loadaddr, uint16_t jmp)
write the decrunch stub
Definition: xipz.cc:253
Data::get_loadaddr
uint16_t get_loadaddr() const
get load address of data
Definition: data.hh:47
POS_OF_END_OF_CDATA
#define POS_OF_END_OF_CDATA
Position in the stub where the end of the compressed data is stored.
Definition: xipz.cc:42
choose_optimal_n
int choose_optimal_n(const Data &data, const HistorgramArray &shisto)
choose optimal number of bits
Definition: xipz.cc:400
gengetopt_args_info::inputs
char ** inputs
unnamed options (options without names)
Definition: cmdline.h:59
calc_histo
HistorgramArray calc_histo(const Data &data)
calculate the histogram
Definition: xipz.cc:158
gengetopt_args_info::raw_given
unsigned int raw_given
Whether raw was given.
Definition: cmdline.h:55
Bits::bits
unsigned bits
Number of bits stored.
Definition: xipz.cc:62
main_xipz
int main_xipz(const std::string &inputname, const std::string &outputname, bool raw, int jump)
main function using xip
Definition: xipz.cc:425
gengetopt_args_info::inputs_num
unsigned inputs_num
unnamed options number
Definition: cmdline.h:60
cmdline.h
The header file for the command line option parser generated by GNU Gengetopt version 2....
write_qadz_stub
std::ostream & write_qadz_stub(std::ostream &out, uint16_t size, uint16_t loadaddr, uint16_t jmp)
write the decrunch stub
Definition: qadz.cc:72
gengetopt_args_info::jump_arg
int jump_arg
address to jump to (-1 = load address) (default='-1').
Definition: cmdline.h:49
create_compressed_data
std::vector< uint8_t > create_compressed_data(const Data &data, const CompressionBits &compbits)
Create compressed data.
Definition: xipz.cc:333
data.hh
Binary data handling.
read_data
Data read_data(const std::string &fname)
Read data from a file.
Definition: xipz.cc:123
HistorgramArray
std::vector< HistEntry > HistorgramArray
Convenience type for a histogram array.
Definition: xipz.cc:109
gengetopt_args_info
Where the command line options are stored.
Definition: cmdline.h:40
write_compressed_data
std::ostream & write_compressed_data(std::ostream &out, const std::vector< uint8_t > &data)
write the compressed data
Definition: xipz.cc:312
main
int main(int argc, char **argv)
main function using xip
Definition: xipz.cc:482
main_qadz
int main_qadz(const std::string &inputname, const std::string &outputname, bool raw, int jump)
main function using qadz
Definition: xipz.cc:457
CMDLINE_PARSER_VERSION
#define CMDLINE_PARSER_VERSION
the program version
Definition: cmdline.h:34
operator<<
std::ostream & operator<<(std::ostream &o, const Bits &x)
Output operator for Bits.
Definition: xipz.cc:80