// Ne2x.c
// Modifications:
// July 16, 2012: replace extension "out" by "txt"
// Since releasing version 2.01:
// -----------------------------
// In April 2014:
// * Change PrtPop/PrtMonoLoc so that when printing non-polymorphic loci,
// include popID (variable for sample ID) if only temporal methods are run.
// * Fix an error in printing jackknife confidence intervals for temporal
// methods (a typo error which causes the lower bound to print "infinite"
// when the upper bound is).
#include <ctype.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
//#define INFINITE (float) 9999999
//#define EPSILON (float) 0.0000001 // used to compare a number with zero
#define INFINITE (float) 10E10
#define EPSILON (float) 10E-10 // used to compare a number with zero
#define MAXDEG 2000000000 // 2 billion, maximum for degree of freedom.
#define MAXLONG 4294967295LU // maximum for unsigned long.
// (For signed long type in 32-bit machine, which has 4 bytes, or 32 bits,
// the largest integer is 2,147,483,647 = 2^{31}-1.
// For unsigned long, largest is 2^{32}-1 = 4,294,967,295)
// When a number, which should be 0 from a calculation but may become nonzero
// by rounding-off error, is used to compare with zero in a condition
// statement, we may use EPSILON instead of zero
#define LEN_LOCUS 10 // max chars for locus names
#define POP_TEMP 20 // chars for population names in temporal method
// used for printing to output (can be adjusted!)
#define LEN_BLOCK 30
// LEN_BLOCK = max chars to record when reading a data block from input
// should be enough to distinguish the names of two populations.
#define GENLEN 10 // maximum length for a genotype
#define LENDIR 250 // max chars for a folder (directory) name
// (maybe 256 is maximum for Windows?)
#define LENFILE 60 // max chars for a file name
// FILENAME_MAX is the constant in C
// Max chars for a full path = folder name + file name. The sum is
// parenthesized so that the macro behaves as a single value in any
// expression (e.g. PATHFILE * sizeof(char)).
#define PATHFILE (LENDIR+LENFILE)
#define FSTAT 1 // for FSTAT format
#define GENPOP 2 // for GENEPOP format
#define FREQUENCY 3 // for frequency format (never used)
#define MINFORM 1 // this should be the minimum of all formats
#define MAXFORM 2 // this should be the maximum of all formats
#define MAXCRIT 10 // maximum number of critical values
#define NCUT_SET 4 // default number of critical values.
#define MAX_SAMP 1000000 // maximum number of samples
#define MAX_POP 1000000 // maximum number of pops
#define MATING 0 // default mating model: 0 for random, 1 for monogamy
#define TABX 0 // default for having tab between columns in tabular-format outputs
// Since the files for Locus data and Burrows coefficients are large,
// especially the latter one, so we limit them:
#define MAXLOCPOP 50 // maximum populations to output in Locus Data file
#define MAXBURRPOP 50 // maximum populations to output in Burrows coef. file
// These are for reading strings
#define WHITESPACE " \t\f\r\v\n" // when skipping chars using WHITESPACE
#define CHARSKIP " ,\t\f\r\v\n" // or CHARSKIP, allow going new lines
#define PATHCHR "\\/" // characters that may represent path,
// In Windows, backslash; in Unix, slash
#define STOPCHAR ",\n"
//#define BLANKS " \t" // not allow jumping lines for next char
#define BLANKS " \t\f\r\v" // not allow jumping lines for next char
#define SPECHR '*'
#define ENDCHRS "*\n\f\r\v" // special char added to new-line char to stop
// collecting chars for a string
#define XCHRSTOP " *,\t\f\r\v\n" // add to CHARSKIP, SPECHR
#define XWHITESTOP " *\t\f\r\v\n" // add to WHITESPACE, SPECHR
#define MERGE 0 // when true, jackknife CI includes parameter CI.
#define XFILSUFLD "xLD.txt" // append to main file for LD xtra output
#define XFILSUFHET "xHt.txt" // append to main file for Het xtra
#define XFILSUFCOAN "xCn.txt" // append to main file for Coan xtra
#define XFILSUFTEMP "xTp.txt" // append to main file for Temporal xtra
#define EXTENSION ".txt" // for adding to the prefix of main output file
// The next 3 are default values for LD method, Heterozygote excess method,
// and Nomura method. Set = 1 if Yes, 0 if No.
#define LDACTION 1
#define HETACTION 1
#define COANACTION 1
#define TEMPACTION 1
#define REWEIGH 0 // set = 0 if only weigh sample size when there are
// missing data (affected methods: LD, Pollak)
// Can remove this constant and replace REWEIGH
// by 0 from RunPop0
#define MAXMETHOD 15 // if there are n methods available, this should
// be set as 2^n-1
#define MAXGENERATION 150 // maximum of generation in temporal method
#define LOCOUTPUT 100 // max # of loci to print details in LocData output
#define LOCBURR 100000 // max # of loci to print in Burrows output
#define BURRSHORT 1 // set = 1 for printing summarized r^2-values for each
// locus pair, set = 0 for all allele pairs in locus pairs
// The next 2 constants (set = 0 as negation) only take effect
// when Burrows outputted in multi files
#define NONAMEBUR 1 // set = 1 for not using input file name as prefix,
#define NOEXPLAIN 1 // set = 1 for not printing out explanations
// (only the headers for columns of values are printed)
#define USETMP 1 // set = 1 to use temporary files when possible (At LD)
// set = 0 then use arrays instead
// for Nomura's method:
#define NONSIBOUT 0 // maximum putative nonsib pairs outputted in outLoc
#define LOCPERLINE 10 // maximum loci per line
#define LOCLIM 100
#define DETAILTEMP 0 // set = 1 if want to have more details in file OUTLOCNAME
#define MAXJACKLD 100000 // maximum number of polymorphic loci in the run
// to allow calculating jackknife CI in LD
// added in July 2016:
#define MINSAMP 3 // minimum number of individuals for jackknife on samples
#define RESETNE 1 // if set = 0, then r^2, exp(r^2), Ne in LD will NOT be
// recalculated when there are missing data. The purpose is to
// produce r^2 on outputs for checking. Normally, set = 1.
// Nov 2016: add this to notify that a Pcrit <= this value means running
// by excluding only singleton alleles.
// Also, function CritValRead will convert Pcrit = 1 into this value,
// so that a user who does not know which small values are smaller
// than this constant can simply enter the value "1" instead.
#define PCRITX 10E-8
#define NOSNGL "No S*"
// add in Mar 2015
// -------------------------------------------------------------------------
// One record tying a locus to a chromosome by name.
// NOTE(review): how records are filled and what "num" counts is not
// visible in this part of the file -- confirm against the code that
// reads the locus/chromosome map.
struct locusMap
{
int num; // NOTE(review): presumably an index or count for the locus -- confirm
char name [LEN_LOCUS]; // locus name, buffer of LEN_LOCUS chars
char chromo[LEN_LOCUS]; // chromosome name, buffer of LEN_LOCUS chars
};
// A chromosome together with the list of loci that belong to it.
struct chromosome
{
char name [LEN_LOCUS]; // name of the chromosome
int nloci; // number of loci in the chromosome
int *locus; // loci in the chromosome: for i = 0, ..., (nloci-1),
// with p = locus[i], the chromosome contains the
// (p+1)th locus read in the input file
// (i.e. entries are zero-based locus indices)
};
// -------------------------------------------------------------------------
// Node of a singly linked list of alleles; ALLEPTR is a pointer to a node.
// NOTE(review): the meaning of the counters below is inferred from their
// names only; the code that fills them is outside this part of the file.
typedef struct allele *ALLEPTR;
struct allele
{
int mValue; // Allele mobility value
int copy; // NOTE(review): presumably number of copies of this allele -- confirm
int homozyg; // NOTE(review): presumably count of homozygotes for this allele -- confirm
float freq; // NOTE(review): presumably allele frequency -- confirm
float hetx; // NOTE(review): presumably used by heterozygote-excess method -- confirm
ALLEPTR next; // next node in the list
};
// Node of a singly linked list of individuals; FISHPTR is a pointer to a node.
typedef struct fish *FISHPTR;
struct fish
{
int gene[2]; // two integer values per individual
// NOTE(review): presumably the two alleles of a genotype
// at one locus -- confirm against the data reader
FISHPTR next; // next node in the list
};
// for Nomura's method:
// Node of a singly linked list of pairs; NONSIBPTR is a pointer to a node.
// NOTE(review): by its name, each node presumably records a putative
// non-sibling pair of individuals for Nomura's method -- confirm.
typedef struct nonsib *NONSIBPTR;
struct nonsib
{
int first; // first member of the pair
int second; // second member of the pair
// int simVal; // this field is not needed
NONSIBPTR next; // next node in the list
};
// the following is for storing estimate of molecular coancestry
// and the weight of allele frequencies in each locus
// Node of a singly linked list holding per-locus quantities for the
// molecular coancestry estimate; COANPTR is a pointer to a node.
typedef struct molecoef *COANPTR;
struct molecoef
{
int locus; // this makes the search easier since this structure
// is used for only loci that are considered (maybe < nloci)
float fresq; // for sum of squares of frequencies
float scoan; // NOTE(review): coancestry-related value at this locus; exact
// definition is set by code outside this part of the file
float diffcoan; // NOTE(review): presumably a difference of coancestry values -- confirm
float weight; // used for storing weight at locus. The weight
// at this locus is the product of this field and (1-scoan).
COANPTR next; // next node in the list
};
// This is for temporal method
// Node of a singly linked list of alleles tracked across generations
// (temporal method); FREQPTR is a pointer to a node.
// NOTE(review): the lengths of the two arrays are not fixed here;
// presumably one entry per generation sampled -- confirm at allocation.
typedef struct timefreq *FREQPTR;
struct timefreq
{
int mValue; // Allele mobility value
int *samples; // number of samples, in different generations
float *freqs; // freq. of alleles, in different generations
FREQPTR next; // next node in the list
};
// Node of a singly linked list of float values; AGEPTR is a pointer to a node.
// NOTE(review): presumably the sampling times (generations) used by the
// temporal method -- confirm against the code that builds the list.
typedef struct age *AGEPTR;
struct age
{
float year; // value stored in this node
AGEPTR next; // next node in the list
};
// ------------------------------------------------------------------------
// Function Prototypes: Only list those used by main.
// For the rest, functions called by others should be listed first.
// ------------------------------------------------------------------------
// From GetData
// --------------------------------------------------------------------------
// Case-insensitive string comparison, returns 0 when the strings are equal
// (defined below, in the Tools section).
int strcmp0 (char str1[], char str2[]);
// Called by main when the program is started without arguments; main
// reports the return value as the number of runs.
int RunDirect (char misFilSuf[]);
// Called by main for an "m:" (mOpt = 0) or "m+:" (mOpt = 1) directive file;
// main reports the return value as the number of data files.
int RunMultiFiles (char *mFileName, char mOpt);
// Called by main for a "c:" directive file; main reports the return value
// as the number of data files.
int RunMultiCommon (char *mFileName);
// Called by main for an "i:" directive file (FileOne), optionally with an
// "o:" option file (FileTwo, used when hasOpt = 1). rem = 1 when "rm" was
// given on the command line.
int RunOption (char misFilSuf[], char LocSuf[], char BurSuf[],
char hasOpt, char rem, char *FileOne, char *FileTwo);
// Copy into dest (capacity: size bytes) the file name that follows the
// first "skip" characters of the command-line argument arg, e.g. the part
// of "m+:list.txt" after "m+:". The name is empty when arg has no more
// than "skip" characters. The program stops with a message if the name
// (plus its terminator) does not fit in dest.
static void CopyDirectiveName (char *dest, size_t size, const char *arg,
                               size_t skip)
{
    size_t len = strlen (arg);
    const char *name = (len > skip) ? arg + skip : "";
    size_t i;

    len = strlen (name);
    if (len >= size) {
        printf ("File name too long!\n");
        exit (EXIT_FAILURE);
    }
    for (i = 0; i <= len; i++) dest[i] = name[i];   // copies the terminator too
}

// Program entry.
// * Without arguments: run interactively through RunDirect.
// * With arguments: the first argument must be a directive file name
//   preceded by one of the following (the char after the letters must be
//   a colon ':'; otherwise the program stops):
//     'm:'  file containing a list of (multiple) input files and some
//           popular options,
//     'm+:' file containing a list of (multiple) input files and more
//           options,
//     'c:'  file containing common settings followed by a list of input
//           files,
//     'i:'  file containing input and output file names. The 'i' file can
//           be followed by a second directive file preceded by
//     'o:'  which supplements options for the 'i'-file.
//   An argument "rm" (case-insensitive) after the directive file(s) asks
//   for the directive file(s) to be removed after use.
// Note on argv[0]: if run under a workbench, argv[0] gives the whole
// executable name including path; when run from a command line it is
// just what was typed. The current directory, if ever needed, can be
// obtained with getcwd (<direct.h> on Windows, <unistd.h> on Unix).
// Illegal arguments print a message and exit with status 0, as before,
// so that existing callers of the executable see no change.
int main(int argc, char *argv[])
{
    char *misFilSuf = "NoDat.txt";
    // added to prefix of output = missing data file name
    char *LocSuf = "Loc.txt";
    // added to prefix of output = Locus Data output file name
    char *BurSuf = "Bur.txt";
    // added to prefix of output = Burrow Coefs output file name
    char *FileOne, *FileTwo;
    size_t pathLen = PATHFILE;  // capacity of FileOne and FileTwo
    size_t skip = 2;            // length of the directive prefix ("i:", "m+:")
    int n;
    int legal;
    char c;
    char mOpt = 0;
    char hasOpt = 0;
    char rem = 0;

    FileOne = (char *) malloc (pathLen * sizeof(char));
    FileTwo = (char *) malloc (pathLen * sizeof(char));
    if (FileOne == NULL || FileTwo == NULL) {
        printf ("Out of memory!\n");
        exit (EXIT_FAILURE);
    }
    *FileOne = '\0';
    *FileTwo = '\0';
    //-----------------------------------------------------------------------
    if (argc < 2) {
        // run from command line without argument.
        n = RunDirect (misFilSuf);
        if (n > 1) printf ("*** Number of runs = %d ***\n", n);
        return 0;
    }

    // Validate the first argument: a legal prefix followed by at least
    // one character of file name.
    n = (int) strlen (argv[1]);
    c = argv[1][0];
    legal = (n > 2) && (c == 'i' || c == 'm' || c == 'c');
    if (legal) {
        if (c == 'm' && argv[1][1] == '+') {
            // "m+:" needs the colon and at least one more character
            legal = (n > 3) && (argv[1][2] == ':');
            mOpt = 1;   // run multiple files with more parameters
            skip = 3;
        } else {
            legal = (argv[1][1] == ':');
        }
    }
    if (!legal) {
        printf ("Illegal argument!\n");
        exit (0);
    }

    // FileOne is the name of the first control file to be used
    CopyDirectiveName (FileOne, pathLen, argv[1], skip);

    if (c == 'm' || c == 'c') {
        // "rm" stands for remove, to remove this file
        if (argc > 2) rem = (strcmp0 (argv[2], "rm") == 0) ? 1 : 0;
        if (c == 'm') n = RunMultiFiles (FileOne, mOpt);
        else n = RunMultiCommon (FileOne);
        printf ("\n*** Number of data files = %d ***\n", n);
        if (rem == 1) remove (FileOne);
        exit (0);
    }

    // Here c == 'i': first directive file is info on input/output files.
    if (argc > 2) {
        hasOpt = (argv[2][0] == 'o' && argv[2][1] == ':') ? 1 : 0;
        // "rm" stands for remove, to remove those "i", "o" files.
        // Compared case-insensitively, as for the 'm' and 'c' directives.
        rem = (strcmp0 (argv[2], "rm") == 0) ? 1 : 0;
        if ((hasOpt == 1) && argc > 3)
            rem = (strcmp0 (argv[3], "rm") == 0) ? 1 : 0;
    }
    if (hasOpt == 1) {
        // FileTwo is the name of the option file (empty for a bare "o:")
        CopyDirectiveName (FileTwo, pathLen, argv[2], 2);
    }
    // FileOne is the name of info directive file
    // FileTwo is the name of option directive file
    RunOption (misFilSuf, LocSuf, BurSuf, hasOpt, rem, FileOne, FileTwo);
    return 0;
}
//--------------------------------------------------------------------------
// Tools
// -----------------------------
// ------------------------------------------------------------------------
char BinaryDigit (int m, char position)
{
// Suppose m is written as binary, e.g., if m = 9, then m = 1001.
// This function returns 0 or 1, the value of the digit at "position",
// where position 1 is the rightmost (least significant) digit. Thus,
// BinaryDigit(9,1)=BinaryDigit(9,4)=1, BinaryDigit(9,2)=BinaryDigit(9,3)=0.
// In other words, writing
// m = a(1)2^0 + a(2)2^1 + ... + a(j)2^(j-1) + ... + a(k)2^(k-1),
// the function returns a(j) for j = "position", and 0 when "position"
// is beyond the leading digit of m.
// A non-positive m is seen as 0, and a position < 1 gives 0.
// This function is used to determine which methods to run.
// With m as input, and parameter "position" as the method, the program
// will read m, and if the value of the function is 1, the method
// associated with "position" will be run.
// m=1: LD (Linkage Disequilibrium) method, (associated with position = 1)
// m=2: Heterozygote method, (associated with position = 2)
// m=4: Molecular Coancestry method (associated with position = 3)
// m=8: Temporal method (associated with position = 4)
// Any sum of those will include the methods represented by the terms.
// For example,
// m=3 = 1+2: both LD and Het methods,
// m=5 = 1+4: both LD and Nomura methods.
// Callers are expected to pass 0 < m <= MAXMETHOD, but every int value
// of m is handled: the digit is found by repeated halving, so no power
// of two is ever computed and nothing can overflow.
    int k;

    if (m <= 0 || position < 1) return 0;
    // drop the (position-1) lowest digits; stop early once m is exhausted
    for (k = 1; k < position && m > 0; k++) m /= 2;
    return (char) (m % 2);
}
// ------------------------------------------------------------------------
// Compare two strings, case-insensitive; return 0 if they are equal.
// This is a case-insensitive version of the strcmp function.
// When the strings differ at some position within the shorter one, the
// result is the difference of the two lower-cased characters at the first
// such position; otherwise it is the difference of the two lengths.
int strcmp0 (char str1[], char str2[])
{
    int len1 = (int) strlen (str1);
    int len2 = (int) strlen (str2);
    int common = (len1 < len2) ? len1 : len2;
    int i;

    for (i = 0; i < common; i++) {
        // tolower requires a value representable as unsigned char
        int c1 = tolower ((unsigned char) str1[i]);
        int c2 = tolower ((unsigned char) str2[i]);
        if (c1 != c2) return (c1 - c2);
    }
    // All compared characters are equal, case insensitive;
    // the lengths decide.
    return len1 - len2;
}
// --------------------------------------------------------------------------
// Tell whether character c is one of the characters of the string "stops".
// Return 1 if it is, 0 if not. The terminating null character of "stops"
// is not considered part of the list, so c = '\0' always gives 0.
int StopSign (char c, char stops[])
{
    const char *scan = stops;

    while (*scan != '\0') {
        if (*scan == c) return 1;
        scan++;
    }
    return 0;
}
// --------------------------------------------------------------------------
// Reverse, in place, the part of string str between positions start and
// stop (both included). Example: with str = "abcdefghijk", start = 2
// (str[2] = 'c') and stop = 9 (str[9] = 'j'), str becomes "abjihgfedck".
// Nothing is done when stop <= start.
void Reverse (char str[], int start, int stop)
{
    int left = start;
    int right = stop;

    // swap the two ends and move inward until the indices meet or cross
    while (left < right) {
        char held = str[left];
        str[left] = str[right];
        str[right] = held;
        left++;
        right--;
    }
}
// --------------------------------------------------------------------------
/* This will not be used:
int SplitName (char *fileName, char *prefix, char *ext,
int lenPre, int lenExt)
// split fileName into two parts: prefix and extension.
// For prefix, take up to lenPre rightmost chars (no slash "\",
// i.e., no folder name). For ext, take up to lenExt rightmost chars.
// Return the length of the prefix.
// (The extension will be in lower case)
{
char c;
int i, j, n, stop;
int dot = 0;
i = strlen(fileName);
// ignore trailing BLANKS:
for (j=i-1; j>=0 && StopSign(*(fileName+j), BLANKS)==1; j--);
*(fileName+ j+1)='\0';
// extract prefix and extension of the file. Extension part is the
// string after dot '.'. Note that there may be several dots in the
// file name, only the string after the last dot is the extension,
// so we need to traverse backward to detect the last one.
stop = j; // the index of the last visible character in the file name.
// count number of dots
for (n=0, i=0; i<=stop; i++) if (*(fileName + i) == '.') n++;
// so n is at least 1 if there is a dot in the name
if (n == 0) dot = 1; // no extension
for (n=stop, i=0, j=0; n>=0; n--) {
c = *(fileName + n);
if (c == '\\') break; // no path name is included in prefix
if (dot == 0 && c!= '.' && i<lenExt) *(ext + i++) = tolower(c);
if (dot > 0 && j<lenPre) *(prefix + j++) = c;
if (c == '.') dot++;
};
*(ext+i)='\0';
*(prefix+j)='\0';
Reverse (ext, 0, i-1);
Reverse (prefix, 0, j-1);
return j;
}
*/
// --------------------------------------------------------------------------
int GetPrefix (char *fileName, char *prefix, int lenPre, char path[])
// Get the prefix of fileName: the part of the name that follows the last
// character belonging to "path" (so that, with path = PATHCHR, no folder
// name is included) and that precedes the last dot '.' of that part.
// If that part has no dot, all of it is the prefix. A dot appearing only
// in the folder part is not taken as the start of an extension.
// Blanks (chars in BLANKS) at the end of the prefix are removed, then at
// most lenPre of its rightmost characters are stored in "prefix", which
// must have room for lenPre + 1 chars.
// Side effect: trailing blanks of fileName itself are cut off in place.
// Return the length of prefix.
{
    int len, start, end, i, n;

    // ignore trailing BLANKS of the file name (modifies fileName):
    len = (int) strlen (fileName);
    while (len > 0 && StopSign (fileName[len-1], BLANKS) == 1) len--;
    fileName[len] = '\0';

    // the name proper starts right after the last path character
    start = len;
    while (start > 0 && StopSign (fileName[start-1], path) == 0) start--;

    // the prefix ends at the last dot of the name proper, if there is one
    end = len;
    for (i = len-1; i >= start; i--) {
        if (fileName[i] == '.') {
            end = i;
            break;
        }
    }
    // remove blanks at the end of the prefix
    while (end > start && StopSign (fileName[end-1], BLANKS) == 1) end--;

    // keep up to lenPre rightmost chars
    if (lenPre < 0) lenPre = 0;
    n = end - start;
    if (n > lenPre) {
        start = end - lenPre;
        n = lenPre;
    }
    for (i = 0; i < n; i++) prefix[i] = fileName[start + i];
    prefix[n] = '\0';
    return n;
}
// --------------------------------------------------------------------------
int GetToken (FILE *input, char *token, int maxlen, char skips[],
              char stops[], int *lastc, int *empty)
// Read input, grab the first block of "acceptable" characters
// (characters that are not in the "stops" list, after skipping over
// leading characters in the "skips" list), put it in token and return
// its length. At most maxlen-1 characters are recorded (maxlen should be
// at least 2 and token must have room for maxlen chars); acceptable
// characters beyond that are read but ignored.
// *lastc receives the last character read in the search (or EOF); it is
// not part of token.
// After grabbing a nonempty token, we are still on the line containing
// it: a new-line character that stops the token is pushed back to input.
// If token is empty, the reading may have gone to the next line, when
// either skips or stops contains the new-line character.
// *empty receives the number of characters in "skips" trimmed from the
// right end of token before returning.
{
    int c;      // int, not char, so that EOF is distinct from every byte
    int i;

    *empty = 0;
    // skip over leading characters in "skips"
    do {
        c = fgetc (input);
    } while (c != EOF && StopSign ((char) c, skips) == 1);

    if (c == EOF || StopSign ((char) c, stops) == 1) {
        // getting nothing: token is empty
        *token = '\0';
        *lastc = c;
        return 0;
    }

    token[0] = (char) c;
    i = 1;
    // go through all acceptable chars until reaching a char in "stops",
    // but only record up to maxlen-1 of them.
    while ((c = fgetc (input)) != EOF && StopSign ((char) c, stops) == 0) {
        if (i < maxlen-1) token[i++] = (char) c;
    }
    *lastc = c;
    // don't want to go to a new line at the end of this function
    // when we already obtain a nonempty token:
    if (c == '\n') ungetc (c, input);
    // If token has trailing characters in "skips", eliminate them and
    // count them in *empty. (token[0] is never in "skips", so i stays > 0;
    // the bound is kept as a safeguard.)
    while (i > 0 && StopSign (token[i-1], skips) == 1) {
        i--;
        (*empty)++;
    }
    token[i] = '\0';
    return i;
}
// --------------------------------------------------------------------------
int Value (char *data)
// Convert a string of digits into an integer; only digits are accepted,
// no sign. Return -1 if any nondigit is in the string, or if the number
// is too large to be held by an int. An empty string gives 0.
{
    const char *p;
    int n = 0;

    for (p = data; *p != '\0'; p++) {
        int digit;
        // isdigit requires a value representable as unsigned char
        if (!isdigit ((unsigned char) *p)) return -1;
        digit = *p - '0';
        // refuse before 10*n + digit would exceed INT_MAX
        if (n > (INT_MAX - digit) / 10) return -1;
        n = 10*n + digit;
    }
    return n;
}
// --------------------------------------------------------------------------
int GetClues (FILE *input, int clueVal[], int nClue, int newline)
{
// Read a line from input, for a maximum of nClue tokens, to get values
// from the tokens read, for array clueVal of nonnegative integers.
// Stop at an invalid one (contains non-digit, e.g., +, -, or a comma)
// or at SPECHR, or at the end of line.
// The difference between SPECHR and other non-digit characters is that
// if a token consists of a valid number immediately followed by
// SPECHR, which is in XWHITESTOP, then SPECHR is not part of the
// token returned by GetToken, so it is still a valid integer by the
// call of Value.
// Return the number of valid integers, maximum is then nClue.
// Go to a new line if one of the following occurs:
// 1. SPECHR is reached (it will not be part of the token read even if
//    it is adjacent on the right of the token on the line)
// 2. Attempt to read a token will have to pass a line.
// 3. A token containing non-digit (determined by function Value).
// 4. Parameter newline was set non-zero at the call.
// In the first 3 cases, the return value is less than nClue.
// When the return value is nClue, new line is obtained solely
// on the value of newline.
    char token[10];     // scratch buffer: tokens are cut to 9 chars
    int count = 0;      // number of valid integers stored so far
    int c, k, m, n;

    if (nClue <= 0) return 0;
    // m > 0 after the loop means: skip the rest of the current line
    m = newline;
    while (count < nClue) {
        // use BLANKS so that no attempt is made to go to the next line to
        // get a non-blank char. If the token obtained by GetToken is
        // empty, then either it ends by SPECHR (or another stop char),
        // or the cursor is already on the next line.
        m = GetToken (input, token, (int) sizeof token, BLANKS, XWHITESTOP,
                      &c, &n);
        if (m <= 0) { // empty token
            // cursor not yet on the next line: ask for the skip below
            if (c != '\n') m = 1;
            break;
        }
        // a token containing some non-digit: m > 0 here, so the cursor
        // will be put to the next line by the code below
        if ((k = Value (token)) < 0) break;
        clueVal[count++] = k;
        m = newline;
        // Stop reading when stopped by SPECHR. (When a token is stopped
        // by a char in XWHITESTOP other than end-of-line, the cursor is
        // put behind that char, so another value could be obtained after
        // this SPECHR; therefore we need to stop right here.)
        if (c == SPECHR) break;
    }
    // If the loop ended by a break, the cursor ends up on the next line
    // no matter the value of newline (either it is there already, or
    // m > 0). If the return value is nClue, the last token read was
    // nonempty, GetToken kept the cursor on the line, and newline decides.
    if (m > 0) {
        while ((c = fgetc (input)) != EOF && c != '\n')
            ;
    }
    return count;
}
// --------------------------------------------------------------------------
// Read up to two nonnegative integers from the current line of input,
// through GetClues, into *low and *high (in that order). A value that is
// not read keeps what the caller had assigned before the call, so the
// caller's values act as defaults. "newline" is passed on to GetClues.
// Return the number of values read, as given by GetClues.
int GetPair (FILE *input, int *low, int *high, int newline)
{
    int bounds[2] = { *low, *high };
    int nRead = GetClues (input, bounds, 2, newline);

    *low = bounds[0];
    *high = bounds[1];
    return nRead;
}
// --------------------------------------------------------------------------
int GetCluesF (FILE *input, float clueVal[], int nClue, int newline, int *last)
{
// Counterpart of GetClues, to get an array of float values instead:
// read the current line of input for a maximum of nClue tokens and store
// their values in clueVal; return the number of values stored.
// Reading stops at SPECHR, at the end of line, or at
//    a token which is not a legitimate float value
// (as decided by sscanf with format "%f").
// As in GetClues, the cursor goes to the next line when reading stopped
// for one of these reasons, or when newline is non-zero.
// Parameter *last = 1 when the reading ends with SPECHR immediately
// following the last legitimate value, else *last = 0.
    char token[10];     // scratch buffer: tokens are cut to 9 chars
    int count = 0;      // number of values stored so far
    int stopper = 0;    // char that ended the latest legitimate value
    int c, m, n;
    float f;

    *last = 0;
    if (nClue <= 0) return 0;
    // m > 0 after the loop means: skip the rest of the current line
    m = newline;
    while (count < nClue) {
        m = GetToken (input, token, (int) sizeof token, BLANKS, XWHITESTOP,
                      &c, &n);
        if (m <= 0) { // empty token, cursor at SPECHR or passed the line
            if (c != '\n') m = 1;
            break;
        }
        if (sscanf (token, "%f", &f) <= 0) break;
        // now, f is a legitimate float value;
        // c is the character in XWHITESTOP that stops token
        stopper = c;
        clueVal[count++] = f;
        m = newline;
        // Stop reading when stopped by SPECHR. (When a token is stopped
        // by a char in XWHITESTOP other than end-of-line, the cursor is
        // put behind that char, so another value could be obtained after
        // this SPECHR; therefore we need to stop right here.)
        if (c == SPECHR) break;
    }
    if (stopper == SPECHR) *last = 1;
    if (m > 0) {
        while ((c = fgetc (input)) != EOF && c != '\n')
            ;
    }
    return count;
}
// --------------------------------------------------------------------------
// Counterpart of GetPair that allows negative values for low, high:
// the two values are read as floats through GetCluesF and then converted
// to int. A value that is not read keeps what the caller had assigned
// before the call. "newline" is passed on to GetCluesF.
// Return the number of values read, as given by GetCluesF.
int GetPairI (FILE *input, int *low, int *high, int newline)
{
    float bounds[2] = { (float) *low, (float) *high };
    int endedBySpechr;  // required by GetCluesF, not used here
    int nRead = GetCluesF (input, bounds, 2, newline, &endedBySpechr);

    *low = (int) bounds[0];
    *high = (int) bounds[1];
    return nRead;
}
// --------------------------------------------------------------------------
int GetInt (FILE *input, int *value, int newline)
{
// Similar to GetPairI, but for attempting one integer only.
// Read one token from the current line of input (leading BLANKS are
// skipped, without going to the next line). If sscanf with format "%d"
// gets an integer from it, store it in *value and return 1; otherwise
// leave *value untouched and return 0.
// The cursor goes to the next line when no integer is obtained; when
// one is obtained, it does so only if newline is non-zero.
    char token[10];     // scratch buffer: tokens are cut to 9 chars
    int found = 0;
    int c, m, n, parsed;

    m = GetToken (input, token, (int) sizeof token, BLANKS, XWHITESTOP,
                  &c, &n);
    if (m <= 0) {
        // empty token: unless GetToken already passed the end of line,
        // ask for the rest of the line to be skipped
        if (c != '\n') m = 1;
    } else if (sscanf (token, "%d", &parsed) > 0) {
        *value = parsed;
        m = newline; // so that cursor still on the line if newline = 0
        found = 1;
    }
    // (a nonempty token that is not an integer leaves m > 0)
    if (m > 0) {
        while ((c = fgetc (input)) != EOF && c != '\n')
            ;
    }
    return found;
}
// --------------------------------------------------------------------------
int GetRanges (FILE *inpFile, int ranges[], int size, int maxVal, char *byRange)
{
// Read ranges as pairs of nonnegative integers into array "ranges" and
// return the number of valid ranges. On return, range k occupies
// ranges[2*k] (low end) and ranges[2*k+1] (high end).
//
// "size" is the capacity of "ranges"; it should be an even number and at
// least 2, since ranges[1] is always written. Each pair read is a
// tentative range, and tentative ranges are allowed to overlap.
//
// Function GetClues is used to get a maximum of "size" numbers on the line.
// Let n be the number of entries obtained by GetClues.
//   If n = 0: no number given, so no limit.
//   If n > 0 and the first entry is 0: no limit.
//   If n = 1 and the entry is positive: one range, from 1 to that number.
//   If n is odd > 1: the last entry (without pairing) is ignored.
//   If n is even: there are n/2 tentative pairs.
// A legitimate pair (i, j) has i nonzero and i <= j. If there is an
// illegitimate pair, that pair and all entries after it are ignored. If
// the first pair is illegitimate, no range is considered entered, which is
// treated as "no limit".
// For no limit: return 1 with ranges[0] = 1, ranges[1] = maxVal and
// *byRange = 0. In all other cases *byRange = 1.
//
// The legitimate ranges are then combined when they overlap or are
// adjacent (low end of one equals high end of the other plus 1), so the
// ranges returned are in ascending order, pairwise disjoint and
// non-adjacent. Slots vacated by combining are reset to 0.
    int i, k, n, nRanges;
    int lo, hi;         // range being inserted during sorting
    int last;           // index of the most recent combined range

    for (i = 0; i < size; i++) ranges[i] = 0;
    // the last argument 1 is to put the cursor to the next line after
    // reading entries for ranges
    n = GetClues(inpFile, ranges, size, 1);
    *byRange = 0;

    if (ranges[0] == 0) {
        // (n = 0) leaves ranges[0] at its default 0, so this covers n = 0
        ranges[0] = 1;
        ranges[1] = maxVal;
        return 1;
    }
    if (n == 1) {
        // only one (positive) entry: one range, from 1 up to that entry
        ranges[1] = ranges[0];
        ranges[0] = 1;
        *byRange = 1;
        return 1;
    }
    // Now n is at least 2 and the first entry is nonzero. If the second
    // entry is smaller than the first, the first pair is illegitimate and
    // we assume no limit, as for a leading 0.
    if (ranges[1] < ranges[0]) {
        ranges[0] = 1;
        ranges[1] = maxVal;
        return 1;
    }
    // since loci will be limited by ranges specified here, set *byRange = 1
    *byRange = 1;

    // The first pair is good. Keep pairs up to (not including) the first
    // illegitimate one.
    nRanges = n / 2;
    for (k = 1; k < nRanges; k++) {
        if (ranges[2*k] == 0 || ranges[2*k] > ranges[2*k + 1]) break;
    }
    nRanges = k;        // at least 1 here

    // Sort the pairs by low end, ascending (insertion sort; the number of
    // pairs is bounded by size/2).
    for (k = 1; k < nRanges; k++) {
        lo = ranges[2*k];
        hi = ranges[2*k + 1];
        for (i = k; i > 0 && ranges[2*(i - 1)] > lo; i--) {
            ranges[2*i] = ranges[2*(i - 1)];
            ranges[2*i + 1] = ranges[2*(i - 1) + 1];
        }
        ranges[2*i] = lo;
        ranges[2*i + 1] = hi;
    }

    // One sweep over the sorted pairs: a pair whose low end is at most the
    // current high end plus 1 is absorbed into the current range; otherwise
    // it starts a new range. Because the pairs are sorted, chains of
    // overlaps are fully combined and no empty range can be left behind.
    last = 0;
    for (k = 1; k < nRanges; k++) {
        // written as "low - 1 <= high" so that a high end equal to INT_MAX
        // does not overflow (low ends are positive for nonnegative input)
        if (ranges[2*k] - 1 <= ranges[2*last + 1]) {
            if (ranges[2*k + 1] > ranges[2*last + 1])
                ranges[2*last + 1] = ranges[2*k + 1];
        } else {
            last++;
            ranges[2*last] = ranges[2*k];
            ranges[2*last + 1] = ranges[2*k + 1];
        }
    }
    // clear the slots no longer in use
    for (k = last + 1; k < nRanges; k++) {
        ranges[2*k] = 0;
        ranges[2*k + 1] = 0;
    }
    nRanges = last + 1;

    // testing:
    //printf ("Ranges: ");
    //for (k=0; k<nRanges; k++) printf (" [%d, %d] ", ranges[2*k], ranges[2*k+1]);
    //printf ("\n");
    return nRanges;
}
// --------------------------------------------------------------------------
int SetMethod (int m, char *mLD, char *mHet, char *mNomura, char *mTemporal)
{
// Translate the method code m into four on/off flags and return how many
// of them ended up equal to 1.
//   m >= MAXMETHOD : every flag is set to 1.
//   0 < m < MAXMETHOD : each flag is taken from BinaryDigit(m, p), with
//       p = 1 for LD, 2 for Het, 3 for Nomura, 4 for Temporal.
//   m <= 0 : every flag stays 0 (no method).
    // start with all methods switched off
    *mLD = 0;
    *mHet = 0;
    *mNomura = 0;
    *mTemporal = 0;

    if (m >= MAXMETHOD) {
        *mLD = *mHet = *mNomura = *mTemporal = 1;
    } else if (m > 0) {
        // digits of m in binary, positions 1 to 4
        *mLD = BinaryDigit (m, 1);
        *mHet = BinaryDigit (m, 2);
        *mNomura = BinaryDigit (m, 3);
        *mTemporal = BinaryDigit (m, 4);
    }

    // each comparison contributes 1 when the flag is on, 0 otherwise
    return (*mLD == 1) + (*mHet == 1) + (*mNomura == 1) + (*mTemporal == 1);
}
// --------------------------------------------------------------------------
// added in Nov 2014:
//
// Modified in Dec 2016
// Name for Burrows file; the special (singleton-dropping) cutoff value is
// represented in the name by the string "-S".
void GetBurrName(char *outBurrName, int popRead, float cutoff)
// Append the Burrows file name suffix to the string already in outBurrName:
//      outBurrName = prefix + "Pop" + popRead + "Bur" + cutoff + ".txt"
// where prefix is the content of outBurrName on entry. The caller must
// supply a buffer with room for the appended text.
// Example: if prefix is empty, popRead = 5, cutoff = 0.02, then
//      outBurrName = "Pop5Bur02.txt"
// (Starting after the dot, take cutoff value up to significant digits,
// i.e. the "%f" digits after the decimal point with trailing zeros
// removed; cutoff = 0 gives "0".)
// Dec 2016: with the special cutoff for dropping singletons
// (0 < cutoff <= PCRITX), "-S" is used in place of the digits.
{
    // "%f" of a float needs at most 1 (sign) + 39 (integer digits)
    // + 1 (dot) + 6 (decimals) + 1 (terminator) = 48 bytes.
    char cutoffBuf[64];
    const char *cutoffText;
    char *digits;
    char *dot;
    size_t len;

    if (cutoff > 0 && cutoff <= PCRITX) {
        cutoffText = "-S";
    } else {
        sprintf (cutoffBuf, "%f", cutoff);
        // Skip characters up to and including the "dot". A separate cursor
        // is used so the buffer itself is never lost: the previous version
        // advanced its malloc'ed pointer and later passed that interior
        // pointer to free().
        digits = cutoffBuf;
        dot = strchr(cutoffBuf, '.');
        if (dot != NULL) digits = dot + 1;
        // drop trailing zeros
        len = strlen(digits);
        while (len > 0 && digits[len - 1] == '0') len--;
        if (len == 0) {     // this is when cutoff = 0
            digits[0] = '0';
            len = 1;
        }
        digits[len] = '\0';
        cutoffText = digits;
    }
    sprintf (outBurrName + strlen(outBurrName), "Pop%dBur%s.txt",
             popRead, cutoffText);
}